update a little

This commit is contained in:
Blade He 2025-02-19 14:32:08 -06:00
parent 705933bbdd
commit bb6862b179
2 changed files with 42 additions and 26 deletions

View File

@ -227,7 +227,11 @@ class DataExtraction:
if management_fee != -1:
found = False
for mf in management_fee_list:
if mf.get("fund_name", "") == fund_name and mf.get("share_name", "") == share_name:
mf_fund_name = mf.get("fund_name", "")
mf_share_name = mf.get("share_name", "")
if (mf_fund_name == fund_name and mf_share_name == share_name) or \
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
mf_value = mf.get("management_fee", -1)
if mf_value != -1 and mf_value >= management_fee:
mf["management_fee"] = management_fee
@ -242,7 +246,11 @@ class DataExtraction:
if management_fee_costs != -1:
found = False
for mfc in management_fee_costs_list:
if mfc.get("fund_name", "") == fund_name and mfc.get("share_name", "") == share_name:
mfc_fund_name = mfc.get("fund_name", "")
mfc_share_name = mfc.get("share_name", "")
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
mfc_value = mfc.get("management_fee_and_costs", -1)
if mfc_value != -1 and mfc_value <= management_fee_costs:
mfc["management_fee_and_costs"] = management_fee_costs

56
main.py
View File

@ -1042,8 +1042,8 @@ def batch_run_documents(
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
calculate_metrics = False
@ -1272,10 +1272,11 @@ def merge_output_data_aus_prospectus(
for exist_raw_name_info in exist_raw_name_list:
exist_raw_name = exist_raw_name_info["raw_name"]
exist_investment_type = exist_raw_name_info["investment_type"]
exist_investment_id = exist_raw_name_info["investment_id"]
if (
exist_raw_name == raw_name
and exist_investment_type == investment_type
):
) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id):
exist = True
break
if not exist:
@ -1295,7 +1296,7 @@ def merge_output_data_aus_prospectus(
for datapoint_name in datapoint_name_list:
data[datapoint_name] = ""
exist_raw_name_list.append(
{"raw_name": raw_name, "investment_type": investment_type}
{"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id}
)
doc_data_list.append(data)
# find data from total_data_list by raw_name
@ -1306,6 +1307,13 @@ def merge_output_data_aus_prospectus(
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
break
if len(share_class_id) > 0 and data["sec_id"] == share_class_id:
update_key = datapoint
if len(data[update_key]) == 0:
data[update_key] = value
if page_index not in data["page_index"]:
data["page_index"].append(page_index)
break
fund_doc_data_df = data_df[
(data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
@ -1367,13 +1375,13 @@ def merge_output_data_aus_prospectus(
if __name__ == "__main__":
# test_data_extraction_metrics()
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
# os.makedirs(merged_total_data_folder, exist_ok=True)
# data_file_base_name = os.path.basename(data_file_path)
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_17_documents_by_text_20250219123515.xlsx"
document_mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
os.makedirs(merged_total_data_folder, exist_ok=True)
data_file_base_name = os.path.basename(data_file_path)
output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
# doc_source = "aus_prospectus"
# sample_document_list_folder: str = r'./sample_documents/'
@ -1421,7 +1429,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["401212184"]
# special_doc_id_list: list = ["391080133"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (
@ -1438,18 +1446,18 @@ if __name__ == "__main__":
)
drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
batch_run_documents(
doc_source=doc_source,
special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder,
document_mapping_file=document_mapping_file,
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_child_folder=output_extract_data_child_folder,
output_extract_data_total_folder=output_extract_data_total_folder,
output_mapping_child_folder=output_mapping_child_folder,
output_mapping_total_folder=output_mapping_total_folder,
drilldown_folder=drilldown_folder,
)
# batch_run_documents(
# doc_source=doc_source,
# special_doc_id_list=special_doc_id_list,
# pdf_folder=pdf_folder,
# document_mapping_file=document_mapping_file,
# output_pdf_text_folder=output_pdf_text_folder,
# output_extract_data_child_folder=output_extract_data_child_folder,
# output_extract_data_total_folder=output_extract_data_total_folder,
# output_mapping_child_folder=output_mapping_child_folder,
# output_mapping_total_folder=output_mapping_total_folder,
# drilldown_folder=drilldown_folder,
# )
elif doc_source == "emea_ar":
special_doc_id_list = [
"292989214",