diff --git a/core/data_extraction.py b/core/data_extraction.py index c153431..258274a 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -227,7 +227,11 @@ class DataExtraction: if management_fee != -1: found = False for mf in management_fee_list: - if mf.get("fund_name", "") == fund_name and mf.get("share_name", "") == share_name: + mf_fund_name = mf.get("fund_name", "") + mf_share_name = mf.get("share_name", "") + if (mf_fund_name == fund_name and mf_share_name == share_name) or \ + (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and + (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): mf_value = mf.get("management_fee", -1) if mf_value != -1 and mf_value >= management_fee: mf["management_fee"] = management_fee @@ -242,7 +246,11 @@ class DataExtraction: if management_fee_costs != -1: found = False for mfc in management_fee_costs_list: - if mfc.get("fund_name", "") == fund_name and mfc.get("share_name", "") == share_name: + mfc_fund_name = mfc.get("fund_name", "") + mfc_share_name = mfc.get("share_name", "") + if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ + (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and + (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): mfc_value = mfc.get("management_fee_and_costs", -1) if mfc_value != -1 and mfc_value <= management_fee_costs: mfc["management_fee_and_costs"] = management_fee_costs diff --git a/main.py b/main.py index 8664504..d2d3208 100644 --- a/main.py +++ b/main.py @@ -1042,8 +1042,8 @@ def batch_run_documents( page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = True calculate_metrics = False @@ -1272,10 +1272,11 @@ def merge_output_data_aus_prospectus( for exist_raw_name_info in exist_raw_name_list: exist_raw_name = exist_raw_name_info["raw_name"] exist_investment_type = exist_raw_name_info["investment_type"] + exist_investment_id = exist_raw_name_info["investment_id"] if ( exist_raw_name == raw_name and exist_investment_type == investment_type - ): + ) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id): exist = True break if not exist: @@ -1295,7 +1296,7 @@ def merge_output_data_aus_prospectus( for datapoint_name in datapoint_name_list: data[datapoint_name] = "" exist_raw_name_list.append( - {"raw_name": raw_name, "investment_type": investment_type} + {"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id} ) doc_data_list.append(data) # find data from total_data_list by raw_name @@ -1306,6 +1307,13 @@ def merge_output_data_aus_prospectus( if page_index not in data["page_index"]: data["page_index"].append(page_index) break + if len(share_class_id) > 0 and data["sec_id"] == share_class_id: + update_key = datapoint + if len(data[update_key]) == 0: + data[update_key] = value + if page_index not in data["page_index"]: + data["page_index"].append(page_index) + break fund_doc_data_df = data_df[ (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) @@ -1367,13 +1375,13 @@ def merge_output_data_aus_prospectus( if __name__ == "__main__": # test_data_extraction_metrics() - # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" - # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" - # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' - # os.makedirs(merged_total_data_folder, exist_ok=True) - # data_file_base_name = os.path.basename(data_file_path) - # output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) - # merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) + data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_17_documents_by_text_20250219123515.xlsx" + document_mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" + merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' + os.makedirs(merged_total_data_folder, exist_ok=True) + data_file_base_name = os.path.basename(data_file_path) + output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) + merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) # doc_source = "aus_prospectus" # sample_document_list_folder: str = r'./sample_documents/' @@ -1421,7 +1429,7 @@ if __name__ == "__main__": # "555377021", # "555654388", # ] - # special_doc_id_list: list = ["401212184"] + # special_doc_id_list: list = ["391080133"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1438,18 +1446,18 @@ if __name__ == "__main__": ) drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - batch_run_documents( - doc_source=doc_source, - special_doc_id_list=special_doc_id_list, - pdf_folder=pdf_folder, - document_mapping_file=document_mapping_file, - output_pdf_text_folder=output_pdf_text_folder, - output_extract_data_child_folder=output_extract_data_child_folder, - output_extract_data_total_folder=output_extract_data_total_folder, - output_mapping_child_folder=output_mapping_child_folder, - output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder, - ) + # batch_run_documents( + # doc_source=doc_source, + # special_doc_id_list=special_doc_id_list, + # pdf_folder=pdf_folder, + # document_mapping_file=document_mapping_file, + # output_pdf_text_folder=output_pdf_text_folder, + # output_extract_data_child_folder=output_extract_data_child_folder, + # output_extract_data_total_folder=output_extract_data_total_folder, + # output_mapping_child_folder=output_mapping_child_folder, + # output_mapping_total_folder=output_mapping_total_folder, + # drilldown_folder=drilldown_folder, + # ) elif doc_source == "emea_ar": special_doc_id_list = [ "292989214",