Support removing duplicated values, keeping only the latest ones.

This commit is contained in:
Blade He 2025-02-26 17:05:58 -06:00
parent f467945cd4
commit 70079d176e
6 changed files with 162 additions and 18 deletions

View File

@ -10,6 +10,6 @@
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
}

View File

@ -10,6 +10,6 @@
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable cost", "recoverable costs", "expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
}

View File

@ -237,11 +237,71 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
"""
data_list = self.remove_duplicate_data(data_list)
data_list = self.post_adjust_management_fee_costs(data_list)
data_list = self.supplement_minimum_initial_investment(data_list)
return data_list
def remove_duplicate_data(self, data_list: list):
"""
The purpose is to remove duplicate data in the different pages.
Reason:
1. Some pdf documents have multiple pages for the same data
2. Usually, the first data is the latest data, and the others is the older data.
3. That's why we need to remove the duplicate data in the different pages.
"""
handled_data_dict_list = []
for data_dict in data_list:
extract_data = data_dict.get("extract_data", {})
data = extract_data.get("data", [])
for data_item in data:
keys = list(data_item.keys())
fund_name = data_item.get("fund_name", "")
share_name = data_item.get("share_name", "")
raw_name = self.get_raw_name(fund_name, share_name)
dp_keys = [key for key in keys if key not in ["fund_name", "share_name"]]
# sort the keys
dp_keys.sort()
additional_dp_keys = [dp_key for dp_key in dp_keys
if dp_key not in ["management_fee", "management_fee_and_costs"]]
if len(additional_dp_keys) == 0:
continue
for c_data_dict in data_list:
if c_data_dict in handled_data_dict_list:
continue
if c_data_dict == data_dict:
continue
c_extract_data = c_data_dict.get("extract_data", {})
c_data = c_extract_data.get("data", [])
remove_c_items = []
for c_data_item in c_data:
c_keys = list(c_data_item.keys())
c_fund_name = c_data_item.get("fund_name", "")
c_share_name = c_data_item.get("share_name", "")
c_raw_name = self.get_raw_name(c_fund_name, c_share_name)
if raw_name != c_raw_name:
continue
c_dp_keys = [key for key in c_keys if key not in ["fund_name", "share_name"]]
c_dp_keys.sort()
if dp_keys == c_dp_keys:
remove_c_items.append(c_data_item)
for remove_c_item in remove_c_items:
if remove_c_item in c_data:
c_data.remove(remove_c_item)
handled_data_dict_list.append(data_dict)
return data_list
def get_raw_name(self, fund_name: str, share_name: str) -> str:
    """
    Build a canonical display name from a fund name and a share name.

    If both names are identical, the fund name alone is used; if the share
    name already contains the fund name as a prefix, the share name alone
    is used; otherwise the two are joined with a single space.
    """
    if fund_name == share_name:
        return fund_name
    if share_name.startswith(fund_name):
        return share_name
    return f"{fund_name} {share_name}"
def post_adjust_management_fee_costs(self, data_list: list):
management_fee_costs_list = []
management_fee_list = []

View File

@ -120,6 +120,7 @@
},
"special_rule": {
"management_fee_and_costs": [
"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.",
"If there are multiple Management fee and costs reported names, here is the priority rule:",
"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
@ -130,11 +131,19 @@
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
"\n",
"A.2 The data value with gross and net, please ignore gross value, output the net value only.",
"---Example 2 Start---",
"---Example Start---",
"Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n",
"---Example 2 End---",
"---Example End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}",
"\n",
"If the context only mentions the gross value or gross asset value or GAV, please ignore the gross value as the management_fee_and_costs and management_fee value, just output empty.",
"---Example Start---",
"Fees and other costs (contd) \n\n08 \n\nType of fee or cost Amount 2 How and when paid \nMANAGEMENT COSTS \nThe fees and costs for \nmanaging your investment 1 \nEstimated at 0.75625% of gross \nasset value (GAV) per annum, \ncomprising: \nThe base management fee is payable from \nthe income and assets of the Fund to the \nInvestment Manager half-yearly in arrears \nBase Management Fee \n0.50% per annum of the Average \nGAV 3 \nAnd \nExpense Recovery Costs \n0.25625% (estimated) per annum \nof GAV in other fees, expenses \nand indirect costs.",
"---Example End---",
"The output should be:",
"{\"data\": []}",
"\n",
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---",
@ -154,6 +163,7 @@
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
"\n",
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
"---Example 1 Start---",

73
main.py
View File

@ -1042,8 +1042,8 @@ def batch_run_documents(
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
calculate_metrics = False
@ -1377,7 +1377,7 @@ def merge_output_data_aus_prospectus(
def get_aus_prospectus_document_category():
document_sample_file = (
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
r"./sample_documents/aus_prospectus_29_documents_sample.txt"
)
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
@ -1422,30 +1422,73 @@ def get_aus_prospectus_document_category():
r"/data/aus_prospectus/output/document_category/"
)
os.makedirs(output_extract_document_category_folder, exist_ok=True)
output_file = os.path.join(output_extract_document_category_folder, "document_category.json")
output_file = os.path.join(output_extract_document_category_folder, "29_documents_category.json")
with open(output_file, "w", encoding="utf-8") as f:
json.dump(document_category_dict, f, ensure_ascii=False, indent=4)
logger.info(f"Document category: {document_category_dict}")
def test_remove_duplicate_extract_data():
    """Manual check: run duplicate removal on one document's saved extract data."""
    doc_id = "369105359"
    doc_source = "aus_prospectus"
    extract_way = "text"
    # Folder layout for the aus_prospectus pipeline.
    pdf_folder: str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = (
        r"/data/aus_prospectus/output/extract_data/docs/"
    )
    output_mapping_child_folder: str = (
        r"/data/aus_prospectus/output/mapping_data/docs/"
    )
    drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
    parsing = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=pdf_folder,
        output_pdf_text_folder=output_pdf_text_folder,
        output_extract_data_folder=output_extract_data_child_folder,
        output_mapping_data_folder=output_mapping_child_folder,
        extract_way=extract_way,
        drilldown_folder=drilldown_folder,
        compare_with_provider=False
    )
    extraction = DataExtraction(doc_source=parsing.doc_source,
                                doc_id=parsing.doc_id,
                                pdf_file=parsing.pdf_file,
                                output_data_folder=parsing.output_extract_data_folder,
                                page_text_dict=parsing.page_text_dict,
                                datapoint_page_info=parsing.datapoint_page_info,
                                datapoints=parsing.datapoints,
                                document_mapping_info_df=parsing.document_mapping_info_df,
                                extract_way=extract_way)
    # Load the previously extracted per-page data and de-duplicate it.
    data_file_path = os.path.join(
        r"/data/aus_prospectus/output/extract_data/docs/by_text/json/",
        f"{doc_id}.json",
    )
    with open(data_file_path, "r", encoding="utf-8") as f:
        data_list = json.load(f)
    data_list = extraction.remove_duplicate_data(data_list)
if __name__ == "__main__":
# test_remove_duplicate_extract_data()
# get_aus_prospectus_document_category()
# test_data_extraction_metrics()
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_17_documents_by_text_20250219123515.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_1_documents_by_text_20250226155259.xlsx"
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
# os.makedirs(merged_total_data_folder, exist_ok=True)
# data_file_base_name = os.path.basename(data_file_path)
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
# doc_source = "aus_prospectus"
# sample_document_list_folder: str = r'./sample_documents/'
# document_list_file: str = "aus_prospectus_52_documents_sample.txt"
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
# output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
doc_source = "aus_prospectus"
sample_document_list_folder: str = r'./sample_documents/'
document_list_file: str = "aus_prospectus_29_documents_sample.txt"
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
# batch_initial_document(sample_document_list_folder=sample_document_list_folder,
# document_list_file=document_list_file,
# doc_source=doc_source,
@ -1453,6 +1496,8 @@ if __name__ == "__main__":
# output_pdf_text_folder=output_pdf_text_folder,
# output_extract_data_child_folder=output_extract_data_child_folder,
# output_mapping_child_folder=output_mapping_child_folder)
# get_aus_prospectus_document_category()
# special_doc_id_list = ["553242411"]
@ -1486,7 +1531,7 @@ if __name__ == "__main__":
# "555377021",
# "555654388",
# ]
# special_doc_id_list: list = ["411062815", "462770987", "420339794", "441280757"]
special_doc_id_list: list = ["412778803"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -0,0 +1,29 @@
530101994
550769189
550522985
539266893
539241700
539261734
550533961
506913190
539266814
521606716
553449663
528208796
539266817
521606755
557526129
540028470
531373053
544886057
557362556
557362553
520663234
527969661
541356150
555377021
523516443
539266874
539266880
526200514
526200513