From b93a8d55e84d7b61845458b1e7a9c38456110797 Mon Sep 17 00:00:00 2001
From: Blade He
Date: Fri, 17 Jan 2025 11:41:58 -0600
Subject: [PATCH] update for output data as template

---
 main.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 67 insertions(+), 13 deletions(-)

diff --git a/main.py b/main.py
index cf7a63b..738c211 100644
--- a/main.py
+++ b/main.py
@@ -1017,7 +1017,7 @@ def batch_run_documents(
     )
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True
     calculate_metrics = False
     extract_way = "text"
 
@@ -1203,22 +1203,31 @@ def merge_output_data_aus_prospectus(
     total_data_list = []
     for doc_id in tqdm(doc_id_list):
         doc_data_list = []
-        doc_data_df = data_df[data_df["doc_id"] == doc_id]
         doc_date = str(
             document_mapping_df[document_mapping_df["DocumentId"] == doc_id][
                 "EffectiveDate"
             ].values[0]
         )[0:10]
+        share_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 1)]
         exist_raw_name_list = []
-        for index, row in doc_data_df.iterrows():
+        for index, row in share_doc_data_df.iterrows():
             doc_id = str(row["doc_id"])
             page_index = int(row["page_index"])
+            raw_fund_name = str(row["raw_fund_name"])
+            raw_share_name = str(row["raw_share_name"])
             raw_name = str(row["raw_name"])
             datapoint = str(row["datapoint"])
             value = row["value"]
             investment_type = row["investment_type"]
-            investment_id = row["investment_id"]
-            investment_name = row["investment_name"]
+            share_class_id = row["investment_id"]
+            share_class_legal_name = row["investment_name"]
+            fund_id = ""
+            fund_legal_name = ""
+            if share_class_id != "":
+                record_row = document_mapping_df[document_mapping_df["FundClassId"] == share_class_id]
+                if len(record_row) > 0:
+                    fund_id = record_row["FundId"].values[0]
+                    fund_legal_name = record_row["FundLegalName"].values[0]
 
             exist = False
             for exist_raw_name_info in exist_raw_name_list:
@@ -1233,9 +1242,13 @@
             if not exist:
                 data = {
                     "DocumentId": doc_id,
-                    "investment_type": investment_type,
-                    "investment_id": investment_id,
-                    "investment_name": investment_name,
+                    "raw_fund_name": raw_fund_name,
+                    "raw_share_name": raw_share_name,
+                    "raw_name": raw_name,
+                    "fund_id": fund_id,
+                    "fund_name": fund_legal_name,
+                    "sec_id": share_class_id,
+                    "sec_name": share_class_legal_name,
                     "EffectiveDate": doc_date,
                     "page_index": [],
                     "RawName": raw_name,
@@ -1249,14 +1262,55 @@
             # find data from total_data_list by raw_name
             for data in doc_data_list:
                 if (
-                    data["RawName"] == raw_name
-                    and data["investment_type"] == investment_type
+                    data["raw_name"] == raw_name
                 ):
                     update_key = datapoint
                     data[update_key] = value
                     if page_index not in data["page_index"]:
                         data["page_index"].append(page_index)
                     break
+
+        fund_doc_data_df = data_df[(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)]
+        for index, row in fund_doc_data_df.iterrows():
+            doc_id = str(row["doc_id"])
+            page_index = int(row["page_index"])
+            raw_fund_name = str(row["raw_fund_name"])
+            raw_share_name = ""
+            raw_name = str(row["raw_name"])
+            datapoint = str(row["datapoint"])
+            value = row["value"]
+            fund_id = row["investment_id"]
+            fund_legal_name = row["investment_name"]
+
+            exist = False
+            if fund_id != "":
+                for data in doc_data_list:
+                    if (fund_id != "" and data["fund_id"] == fund_id) or \
+                            (data["raw_fund_name"] == raw_fund_name):
+                        update_key = datapoint
+                        data[update_key] = value
+                        if page_index not in data["page_index"]:
+                            data["page_index"].append(page_index)
+                        exist = True
+
+            if not exist:
+                data = {
+                    "DocumentId": doc_id,
+                    "raw_fund_name": raw_fund_name,
+                    "raw_share_name": "",
+                    "raw_name": raw_name,
+                    "fund_id": fund_id,
+                    "fund_name": fund_legal_name,
+                    "sec_id": "",
+                    "sec_name": "",
+                    "EffectiveDate": doc_date,
+                    "page_index": [page_index],
+                    "RawName": raw_name,
+                }
+                for datapoint_name in datapoint_name_list:
+                    data[datapoint_name] = ""
+                data[datapoint] = value
+                doc_data_list.append(data)
         total_data_list.extend(doc_data_list)
     total_data_df = pd.DataFrame(total_data_list)
     total_data_df.fillna("", inplace=True)
@@ -1265,7 +1319,7 @@
 
 
 if __name__ == "__main__":
-    data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250115185745.xlsx"
+    data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
     document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
     merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
     os.makedirs(merged_total_data_folder, exist_ok=True)
@@ -1286,7 +1340,7 @@
 
     # special_doc_id_list = ["553242411"]
 
-    # doc_source = "emea_ar"
+    # doc_source = "aus_prospectus"
     # if doc_source == "aus_prospectus":
     #     special_doc_id_list: list = [
    #         "539790009",
@@ -1301,7 +1355,7 @@
     #         "555377021",
     #         "555654388",
     #     ]
-    #     special_doc_id_list: list = ["554851189"]
+    #     # special_doc_id_list: list = ["554851189"]
     #     pdf_folder: str = r"/data/aus_prospectus/pdf/"
     #     output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
     #     output_extract_data_child_folder: str = (