update a little: relax management-fee name matching and merge AUS prospectus rows by share class id

This commit is contained in:
Blade He 2025-02-19 14:32:08 -06:00
parent 705933bbdd
commit bb6862b179
2 changed files with 42 additions and 26 deletions

View File

@@ -227,7 +227,11 @@ class DataExtraction:
if management_fee != -1: if management_fee != -1:
found = False found = False
for mf in management_fee_list: for mf in management_fee_list:
if mf.get("fund_name", "") == fund_name and mf.get("share_name", "") == share_name: mf_fund_name = mf.get("fund_name", "")
mf_share_name = mf.get("share_name", "")
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
mf_value = mf.get("management_fee", -1) mf_value = mf.get("management_fee", -1)
if mf_value != -1 and mf_value >= management_fee: if mf_value != -1 and mf_value >= management_fee:
mf["management_fee"] = management_fee mf["management_fee"] = management_fee
@@ -242,7 +246,11 @@ class DataExtraction:
if management_fee_costs != -1: if management_fee_costs != -1:
found = False found = False
for mfc in management_fee_costs_list: for mfc in management_fee_costs_list:
if mfc.get("fund_name", "") == fund_name and mfc.get("share_name", "") == share_name: mfc_fund_name = mfc.get("fund_name", "")
mfc_share_name = mfc.get("share_name", "")
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
mfc_value = mfc.get("management_fee_and_costs", -1) mfc_value = mfc.get("management_fee_and_costs", -1)
if mfc_value != -1 and mfc_value <= management_fee_costs: if mfc_value != -1 and mfc_value <= management_fee_costs:
mfc["management_fee_and_costs"] = management_fee_costs mfc["management_fee_and_costs"] = management_fee_costs

56
main.py
View File

@@ -1042,8 +1042,8 @@ def batch_run_documents(
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = False re_run_extract_data = True
re_run_mapping_data = False re_run_mapping_data = True
force_save_total_data = True force_save_total_data = True
calculate_metrics = False calculate_metrics = False
@@ -1272,10 +1272,11 @@ def merge_output_data_aus_prospectus(
for exist_raw_name_info in exist_raw_name_list: for exist_raw_name_info in exist_raw_name_list:
exist_raw_name = exist_raw_name_info["raw_name"] exist_raw_name = exist_raw_name_info["raw_name"]
exist_investment_type = exist_raw_name_info["investment_type"] exist_investment_type = exist_raw_name_info["investment_type"]
exist_investment_id = exist_raw_name_info["investment_id"]
if ( if (
exist_raw_name == raw_name exist_raw_name == raw_name
and exist_investment_type == investment_type and exist_investment_type == investment_type
): ) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id):
exist = True exist = True
break break
if not exist: if not exist:
@@ -1295,7 +1296,7 @@ def merge_output_data_aus_prospectus(
for datapoint_name in datapoint_name_list: for datapoint_name in datapoint_name_list:
data[datapoint_name] = "" data[datapoint_name] = ""
exist_raw_name_list.append( exist_raw_name_list.append(
{"raw_name": raw_name, "investment_type": investment_type} {"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id}
) )
doc_data_list.append(data) doc_data_list.append(data)
# find data from total_data_list by raw_name # find data from total_data_list by raw_name
@@ -1306,6 +1307,13 @@ def merge_output_data_aus_prospectus(
if page_index not in data["page_index"]: if page_index not in data["page_index"]:
data["page_index"].append(page_index) data["page_index"].append(page_index)
break break
if len(share_class_id) > 0 and data["sec_id"] == share_class_id:
update_key = datapoint
if len(data[update_key]) == 0:
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
break
fund_doc_data_df = data_df[ fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33) (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
@@ -1367,13 +1375,13 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__": if __name__ == "__main__":
# test_data_extraction_metrics() # test_data_extraction_metrics()
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx" data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_17_documents_by_text_20250219123515.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx" document_mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/' merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
# os.makedirs(merged_total_data_folder, exist_ok=True) os.makedirs(merged_total_data_folder, exist_ok=True)
# data_file_base_name = os.path.basename(data_file_path) data_file_base_name = os.path.basename(data_file_path)
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
# doc_source = "aus_prospectus" # doc_source = "aus_prospectus"
# sample_document_list_folder: str = r'./sample_documents/' # sample_document_list_folder: str = r'./sample_documents/'
@@ -1421,7 +1429,7 @@ if __name__ == "__main__":
# "555377021", # "555377021",
# "555654388", # "555654388",
# ] # ]
# special_doc_id_list: list = ["401212184"] # special_doc_id_list: list = ["391080133"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@@ -1438,18 +1446,18 @@ if __name__ == "__main__":
) )
drilldown_folder = r"/data/aus_prospectus/output/drilldown/" drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
batch_run_documents( # batch_run_documents(
doc_source=doc_source, # doc_source=doc_source,
special_doc_id_list=special_doc_id_list, # special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder, # pdf_folder=pdf_folder,
document_mapping_file=document_mapping_file, # document_mapping_file=document_mapping_file,
output_pdf_text_folder=output_pdf_text_folder, # output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_child_folder=output_extract_data_child_folder, # output_extract_data_child_folder=output_extract_data_child_folder,
output_extract_data_total_folder=output_extract_data_total_folder, # output_extract_data_total_folder=output_extract_data_total_folder,
output_mapping_child_folder=output_mapping_child_folder, # output_mapping_child_folder=output_mapping_child_folder,
output_mapping_total_folder=output_mapping_total_folder, # output_mapping_total_folder=output_mapping_total_folder,
drilldown_folder=drilldown_folder, # drilldown_folder=drilldown_folder,
) # )
elif doc_source == "emea_ar": elif doc_source == "emea_ar":
special_doc_id_list = [ special_doc_id_list = [
"292989214", "292989214",