From 41f8c307ff010e303cee9f6e014bb431ac206c3c Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 27 Jan 2025 12:32:36 -0600 Subject: [PATCH] a little change --- main.py | 147 ++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 115 insertions(+), 32 deletions(-) diff --git a/main.py b/main.py index 7b41d6f..bf07834 100644 --- a/main.py +++ b/main.py @@ -31,11 +31,14 @@ class EMEA_AR_Parsing: output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", extract_way: str = "text", drilldown_folder: str = r"/data/emea_ar/output/drilldown/", + compare_with_provider: bool = True ) -> None: self.doc_id = doc_id self.doc_source = doc_source self.pdf_folder = pdf_folder os.makedirs(self.pdf_folder, exist_ok=True) + self.compare_with_provider = compare_with_provider + self.pdf_file = self.download_pdf() self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) @@ -72,11 +75,11 @@ class EMEA_AR_Parsing: os.makedirs(self.output_mapping_data_folder, exist_ok=True) self.filter_pages = FilterPages( - self.doc_id, - self.pdf_file, - self.document_mapping_info_df, + self.doc_id, + self.pdf_file, + self.document_mapping_info_df, self.doc_source, - output_pdf_text_folder + output_pdf_text_folder, ) self.page_text_dict = self.filter_pages.page_text_dict @@ -87,7 +90,9 @@ class EMEA_AR_Parsing: drilldown_folder = r"/data/emea_ar/output/drilldown/" os.makedirs(drilldown_folder, exist_ok=True) self.drilldown_folder = drilldown_folder - misc_config_file = os.path.join(f"./configuration/{doc_source}/", "misc_config.json") + misc_config_file = os.path.join( + f"./configuration/{doc_source}/", "misc_config.json" + ) if os.path.exists(misc_config_file): with open(misc_config_file, "r", encoding="utf-8") as f: misc_config = json.load(f) @@ -278,7 +283,8 @@ class EMEA_AR_Parsing: data_from_gpt, self.document_mapping_info_df, self.output_mapping_data_folder, - self.doc_source + self.doc_source, + compare_with_provider=self.compare_with_provider ) return data_mapping.mapping_raw_data_entrance() @@ -334,6 +340,7 @@ def mapping_data( output_mapping_data_folder=output_mapping_folder, extract_way=extract_way, drilldown_folder=drilldown_folder, + compare_with_provider=False ) doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data( re_run=re_run_extract_data @@ -501,19 +508,29 @@ def batch_start_job( result_extract_data_df.to_excel( writer, index=False, sheet_name="extract_data" ) - - if document_mapping_file is not None and len(document_mapping_file) > 0 and os.path.exists(document_mapping_file): + + if ( + document_mapping_file is not None + and len(document_mapping_file) > 0 + and os.path.exists(document_mapping_file) + ): try: - merged_total_data_folder = os.path.join(output_mapping_total_folder, "merged/") + merged_total_data_folder = os.path.join( + output_mapping_total_folder, "merged/" + ) os.makedirs(merged_total_data_folder, exist_ok=True) data_file_base_name = os.path.basename(output_file) - output_merged_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) - merge_output_data_aus_prospectus(output_file, document_mapping_file, output_merged_data_file_path) + output_merged_data_file_path = os.path.join( + merged_total_data_folder, "merged_" + data_file_base_name + ) + merge_output_data_aus_prospectus( + output_file, document_mapping_file, output_merged_data_file_path + ) except Exception as e: logger.error(f"Error: {e}") if calculate_metrics: - prediction_sheet_name = "total_mapping_data" + prediction_sheet_name = "data_in_doc_mapping" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" metrics_output_folder = r"/data/emea_ar/output/metrics/" @@ -770,11 +787,11 @@ def test_auto_generate_instructions(): def test_data_extraction_metrics(): - data_type = "data_extraction" + data_type = "document_mapping_in_db" # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx" - prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240922152517.xlsx" + prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_51_documents_by_text_20250127104008.xlsx" # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/481475385.xlsx" - prediction_sheet_name = "mapping_data" + prediction_sheet_name = "data_in_doc_mapping" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" metrics_output_folder = r"/data/emea_ar/output/metrics/" @@ -1015,7 +1032,7 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True force_save_total_data = False calculate_metrics = False @@ -1194,13 +1211,17 @@ def merge_output_data_aus_prospectus( ): # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16 data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data") - document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping") + document_mapping_df = pd.read_excel( + document_mapping_file, sheet_name="document_mapping" + ) # set doc_id to be string type data_df["doc_id"] = data_df["doc_id"].astype(str) document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str) doc_id_list = data_df["doc_id"].unique().tolist() - datapoint_keyword_config_file = r"./configuration/aus_prospectus/datapoint_name.json" + datapoint_keyword_config_file = ( + r"./configuration/aus_prospectus/datapoint_name.json" + ) with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f: datapoint_keyword_config = json.load(f) datapoint_name_list = list(datapoint_keyword_config.keys()) @@ -1212,7 +1233,9 @@ def merge_output_data_aus_prospectus( "EffectiveDate" ].values[0] )[0:10] - share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)] + share_doc_data_df = data_df[ + (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1) + ] exist_raw_name_list = [] for index, row in share_doc_data_df.iterrows(): doc_id = str(row["doc_id"]) @@ -1228,7 +1251,9 @@ def merge_output_data_aus_prospectus( fund_id = "" fund_legal_name = "" if share_class_id != "": - record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id] + record_row = document_mapping_df[ + document_mapping_df["FundClassId"] == share_class_id + ] if len(record_row) > 0: fund_id = record_row["FundId"].values[0] fund_legal_name = record_row["FundLegalName"].values[0] @@ -1265,16 +1290,16 @@ def merge_output_data_aus_prospectus( doc_data_list.append(data) # find data from total_data_list by raw_name for data in doc_data_list: - if ( - data["raw_name"] == raw_name - ): + if data["raw_name"] == raw_name: update_key = datapoint data[update_key] = value if page_index not in data["page_index"]: data["page_index"].append(page_index) break - - fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)] + + fund_doc_data_df = data_df[ + (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) + ] for index, row in fund_doc_data_df.iterrows(): doc_id = str(row["doc_id"]) page_index = int(row["page_index"]) @@ -1285,12 +1310,13 @@ def merge_output_data_aus_prospectus( value = row["value"] fund_id = row["investment_id"] fund_legal_name = row["investment_name"] - + exist = False if fund_id != "": for data in doc_data_list: - if (fund_id != "" and data["fund_id"] == fund_id) or \ - (data["raw_fund_name"] == raw_fund_name): + if (fund_id != "" and data["fund_id"] == fund_id) or ( + data["raw_fund_name"] == raw_fund_name + ): update_key = datapoint data[update_key] = value if page_index not in data["page_index"]: @@ -1323,6 +1349,7 @@ def merge_output_data_aus_prospectus( if __name__ == "__main__": + # test_data_extraction_metrics() # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' @@ -1347,10 +1374,12 @@ if __name__ == "__main__": # output_mapping_child_folder=output_mapping_child_folder) # special_doc_id_list = ["553242411"] - + doc_source = "emea_ar" if doc_source == "aus_prospectus": - document_sample_file = r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + document_sample_file = ( + r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt" + ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" @@ -1383,7 +1412,7 @@ if __name__ == "__main__": r"/data/aus_prospectus/output/mapping_data/total/" ) drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - + batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list, @@ -1397,7 +1426,61 @@ if __name__ == "__main__": drilldown_folder=drilldown_folder, ) elif doc_source == "emea_ar": - special_doc_id_list = ["553242408"] + special_doc_id_list = [ + "292989214", + "316237292", + "321733631", + "323390570", + "327956364", + "333207452", + "334718372", + "344636875", + "362246081", + "366179419", + "380945052", + "382366116", + "387202452", + "389171486", + "391456740", + "391736837", + "394778487", + "401684600", + "402113224", + "402181770", + "402397014", + "405803396", + "445102363", + "445256897", + "448265376", + "449555622", + "449623976", + "458291624", + "458359181", + "463081566", + "469138353", + "471641628", + "476492237", + "478585901", + "478586066", + "479042264", + "479793787", + "481475385", + "483617247", + "486378555", + "486383912", + "492121213", + "497497599", + "502693599", + "502821436", + "503194284", + "506559375", + "507967525", + "508854243", + "509845549", + "520879048", + "529925114", + ] + special_doc_id_list = ["471641628"] batch_run_documents( doc_source=doc_source, special_doc_id_list=special_doc_id_list )