diff --git a/main.py b/main.py
index 68d9d78..b3132d1 100644
--- a/main.py
+++ b/main.py
@@ -914,7 +914,7 @@ def batch_run_documents(special_doc_id_list: list = None,
     )
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True
     calculate_metrics = False
 
     extract_way = "text"
@@ -1058,6 +1058,81 @@ def merge_output_data(data_file_path: str,
     total_data_df.fillna("", inplace=True)
     with pd.ExcelWriter(output_data_file_path) as writer:
         total_data_df.to_excel(writer, index=False, sheet_name="total_data")
+
+
+def merge_output_data_aus_prospectus(data_file_path: str,
+                                     document_mapping_file: str,
+                                     output_data_file_path: str):
+    """
+    Merge the AUS prospectus mapping rows into one record per investment.
+
+    Reads sheet "total_mapping_data" of data_file_path (one row per extracted
+    data point), takes each document's EffectiveDate from sheet "doc_date" of
+    document_mapping_file, and writes one record per (doc_id, raw_name,
+    investment_type) to sheet "total_data" of output_data_file_path.
+
+    Sample rows of "total_mapping_data":
+    doc_id page_index raw_name datapoint value raw_check comment investment_type investment_id investment_name similarity
+    553242368 344 Deutsche MSCI World Index Fund tor 61 33 FS0000AY1Y Xtrackers MSCI World Index Fund 0.75
+    553242368 344 db x-trackers EUR Liquid Corporate 12.5 UCITS ETF - Klasse 1C ter 0.35 1 F000018PY1 Xtrackers EUR Corporate Green Bond UCITS ETF 1C 0.462
+
+    Raises:
+        IndexError: a doc_id has no row in sheet "doc_date".
+        KeyError: a datapoint is not a key of data_point_dict.
+    """
+    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
+    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date")
+    # set doc_id to be string type
+    data_df["doc_id"] = data_df["doc_id"].astype(str)
+    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
+    doc_id_list = data_df["doc_id"].unique().tolist()
+    data_point_dict = {
+        "tor": "TurnoverRatio",
+        "ter": "NetExpenseRatio",
+        "ogc": "OngoingCharge",
+        "performance_fee": "PerformanceFee"
+    }
+    total_data_list = []
+    for doc_id in tqdm(doc_id_list):
+        doc_data_df = data_df[data_df["doc_id"] == doc_id]
+        doc_date_values = document_mapping_df[document_mapping_df["DocumentId"] == doc_id]["EffectiveDate"].values
+        if len(doc_date_values) == 0:
+            raise IndexError(f"doc_id {doc_id} not found in sheet doc_date")
+        doc_date = str(doc_date_values[0])[0:10]
+        # (raw_name, investment_type) -> record; dict keeps first-seen order
+        doc_data_dict = {}
+        for _, row in doc_data_df.iterrows():
+            page_index = int(row["page_index"])
+            raw_name = str(row["raw_name"])
+            datapoint = str(row["datapoint"])
+            investment_type = row["investment_type"]
+            # NaN never equals itself: key a missing investment_type as "" so
+            # such rows still find their record instead of losing the value
+            type_key = "" if pd.isna(investment_type) else investment_type
+            data = doc_data_dict.get((raw_name, type_key))
+            if data is None:
+                data = {
+                    "DocumentId": doc_id,
+                    "investment_type": investment_type,
+                    "investment_id": row["investment_id"],
+                    "investment_name": row["investment_name"],
+                    "EffectiveDate": doc_date,
+                    "page_index": [],
+                    "RawName": raw_name,
+                    "NetExpenseRatio": "",
+                    "OngoingCharge": "",
+                    "TurnoverRatio": "",
+                    "PerformanceFee": ""
+                }
+                doc_data_dict[(raw_name, type_key)] = data
+            data[data_point_dict[datapoint]] = row["value"]
+            if page_index not in data["page_index"]:
+                data["page_index"].append(page_index)
+        total_data_list.extend(doc_data_dict.values())
+    total_data_df = pd.DataFrame(total_data_list)
+    total_data_df.fillna("", inplace=True)
+    with pd.ExcelWriter(output_data_file_path) as writer:
+        total_data_df.to_excel(writer, index=False, sheet_name="total_data")
 
 
 if __name__ == "__main__":
@@ -1079,19 +1154,18 @@ if __name__ == "__main__":
 
     # special_doc_id_list = ["553242411"]
 
-    # special_doc_id_list: list = ["539790009",
-    #                              "542300403",
-    #                              "542301117",
-    #                              "542306317",
-    #                              "547567013",
-    #                              "552505237",
-    #                              "552505278",
-    #                              "554431052",
-    #                              "554851189",
-    #                              "555377021",
-    #                              "555654388"]
-    special_doc_id_list: list = ["539790009", "542301117"]
-    special_doc_id_list: list = ["539790009"]
+    special_doc_id_list: list = ["539790009",
+                                 "542300403",
+                                 "542301117",
+                                 "542306317",
+                                 "547567013",
+                                 "552505237",
+                                 "552505278",
+                                 "554431052",
+                                 "554851189",
+                                 "555377021",
+                                 "555654388"]
+    # special_doc_id_list: list = ["539790009", "542301117"]
     pdf_folder:str = r"/data/aus_prospectus/pdf/"
     output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
     output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
diff --git a/prepare_data.py b/prepare_data.py
index 31b11f3..969e64b 100644
--- a/prepare_data.py
+++ b/prepare_data.py
@@ -1367,9 +1367,9 @@ def merge_aus_document_prospectus_data():
     """
     Merge AUS document and prospectus data.
     """
-    aus_document_file = r"/data/aus_prospectus/basic_information/document_mapping.xlsx"
-    aus_prospectus_file = r"/data/aus_prospectus/basic_information/aus_prospectus_data.xlsx"
-    aus_document_data = pd.read_excel(aus_document_file)
+    aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
+    aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
+    aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
     aus_prospectus_data = pd.read_excel(aus_prospectus_file)
 
     aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
@@ -1380,7 +1380,7 @@ def merge_aus_document_prospectus_data():
         on=["FundClassId", "EffectiveDate"],
         how="left",
     )
-    aus_document_prospectus_file = r"/data/aus_prospectus/aus_document_prospectus.xlsx"
+    aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
     with pd.ExcelWriter(aus_document_prospectus_file) as writer:
         aus_document_prospectus_data.to_excel(
             writer, sheet_name="aus_document_prospectus", index=False
@@ -1393,7 +1393,7 @@ def get_pdf_2_html():
 
 
 if __name__ == "__main__":
-    # merge_aus_document_prospectus_data()
+    merge_aus_document_prospectus_data()
     folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
     file_name = "doc_ar_data_for_emea_11_06.xlsx"
     # get_document_with_all_4_data_points(folder, file_name, None)
@@ -1431,7 +1431,7 @@ if __name__ == "__main__":
 
     pdf_folder = r"/data/aus_prospectus/pdf/"
     output_folder = r"/data/aus_prospectus/pdf_txt/"
-    output_pdf_page_text(pdf_folder, output_folder)
+    # output_pdf_page_text(pdf_folder, output_folder)
     # extract_pdf_table(pdf_folder, output_folder)
 
     # analyze_json_error()