diff --git a/main.py b/main.py index e4d79bd..cf7a63b 100644 --- a/main.py +++ b/main.py @@ -1190,22 +1190,16 @@ def merge_output_data_aus_prospectus( ): # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16 data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data") - document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="doc_date") + document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="Sheet1") # set doc_id to be string type data_df["doc_id"] = data_df["doc_id"].astype(str) document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str) - """ - doc_id page_index raw_name datapoint value raw_check comment investment_type investment_id investment_name similarity - 553242368 344 Deutsche MSCI World Index Fund tor 61 33 FS0000AY1Y Xtrackers MSCI World Index Fund 0.75 - 553242368 344 db x-trackers EUR Liquid Corporate 12.5 UCITS ETF - Klasse 1C ter 0.35 1 F000018PY1 Xtrackers EUR Corporate Green Bond UCITS ETF 1C 0.462 - """ + doc_id_list = data_df["doc_id"].unique().tolist() - data_point_dict = { - "tor": "TurnoverRatio", - "ter": "NetExpenseRatio", - "ogc": "OngoingCharge", - "performance_fee": "PerformanceFee", - } + datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json" + with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f: + datapoint_keyword_config = json.load(f) + datapoint_name_list = list(datapoint_keyword_config.keys()) total_data_list = [] for doc_id in tqdm(doc_id_list): doc_data_list = [] @@ -1245,11 +1239,9 @@ def merge_output_data_aus_prospectus( "EffectiveDate": doc_date, "page_index": [], "RawName": raw_name, - "NetExpenseRatio": "", - "OngoingCharge": "", - "TurnoverRatio": "", - "PerformanceFee": "", } + for datapoint_name in datapoint_name_list: + data[datapoint_name] = "" exist_raw_name_list.append( {"raw_name": raw_name, "investment_type": investment_type} ) @@ -1260,7 +1252,7 @@ def merge_output_data_aus_prospectus( data["RawName"] == raw_name and data["investment_type"] == investment_type ): - update_key = data_point_dict[datapoint] + update_key = datapoint data[update_key] = value if page_index not in data["page_index"]: data["page_index"].append(page_index) @@ -1273,10 +1265,13 @@ def merge_output_data_aus_prospectus( if __name__ == "__main__": - # data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx" - # document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" - # output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx" - # merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path) + data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx" + document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" + merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' + os.makedirs(merged_total_data_folder, exist_ok=True) + data_file_base_name = os.path.basename(data_file_path) + output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) + merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) # sample_document_list_folder: str = r'./sample_documents/' # document_list_file: str = "aus_prospectus.txt" @@ -1290,53 +1285,54 @@ if __name__ == "__main__": # output_mapping_child_folder=output_mapping_child_folder) # special_doc_id_list = ["553242411"] - doc_source = "emea_ar" - if doc_source == "aus_prospectus": - special_doc_id_list: list = [ - "539790009", - "542300403", - "542301117", - "542306317", - "547567013", - "552505237", - "552505278", - "554431052", - "554851189", - "555377021", - "555654388", - ] - special_doc_id_list: list = ["554851189"] - pdf_folder: str = r"/data/aus_prospectus/pdf/" - output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" - output_extract_data_child_folder: str = ( - r"/data/aus_prospectus/output/extract_data/docs/" - ) - output_extract_data_total_folder: str = ( - r"/data/aus_prospectus/output/extract_data/total/" - ) - output_mapping_child_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/docs/" - ) - output_mapping_total_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/total/" - ) - drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - batch_run_documents( - doc_source=doc_source, - special_doc_id_list=special_doc_id_list, - pdf_folder=pdf_folder, - output_pdf_text_folder=output_pdf_text_folder, - output_extract_data_child_folder=output_extract_data_child_folder, - output_extract_data_total_folder=output_extract_data_total_folder, - output_mapping_child_folder=output_mapping_child_folder, - output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder, - ) - elif doc_source == "emea_ar": - special_doc_id_list = ["553242408"] - batch_run_documents( - doc_source=doc_source, special_doc_id_list=special_doc_id_list - ) + + # doc_source = "emea_ar" + # if doc_source == "aus_prospectus": + # special_doc_id_list: list = [ + # "539790009", + # "542300403", + # "542301117", + # "542306317", + # "547567013", + # "552505237", + # "552505278", + # "554431052", + # "554851189", + # "555377021", + # "555654388", + # ] + # special_doc_id_list: list = ["554851189"] + # pdf_folder: str = r"/data/aus_prospectus/pdf/" + # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + # output_extract_data_child_folder: str = ( + # r"/data/aus_prospectus/output/extract_data/docs/" + # ) + # output_extract_data_total_folder: str = ( + # r"/data/aus_prospectus/output/extract_data/total/" + # ) + # output_mapping_child_folder: str = ( + # r"/data/aus_prospectus/output/mapping_data/docs/" + # ) + # output_mapping_total_folder: str = ( + # r"/data/aus_prospectus/output/mapping_data/total/" + # ) + # drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + # batch_run_documents( + # doc_source=doc_source, + # special_doc_id_list=special_doc_id_list, + # pdf_folder=pdf_folder, + # output_pdf_text_folder=output_pdf_text_folder, + # output_extract_data_child_folder=output_extract_data_child_folder, + # output_extract_data_total_folder=output_extract_data_total_folder, + # output_mapping_child_folder=output_mapping_child_folder, + # output_mapping_total_folder=output_mapping_total_folder, + # drilldown_folder=drilldown_folder, + # ) + # elif doc_source == "emea_ar": + # special_doc_id_list = ["553242408"] + # batch_run_documents( + # doc_source=doc_source, special_doc_id_list=special_doc_id_list + # ) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"