Support removing duplicated values so that only the latest values are kept.
This commit is contained in:
parent
f467945cd4
commit
70079d176e
|
|
@ -10,6 +10,6 @@
|
||||||
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
|
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
|
||||||
"benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
|
"benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
|
||||||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
|
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
|
||||||
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
|
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
||||||
}
|
}
|
||||||
|
|
@ -10,6 +10,6 @@
|
||||||
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
|
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
|
||||||
"benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
|
"benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
|
||||||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
|
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
|
||||||
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
|
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
|
||||||
}
|
}
|
||||||
|
|
@ -237,11 +237,71 @@ class DataExtraction:
|
||||||
data_dict["completion_token"] = result.get("completion_token", 0)
|
data_dict["completion_token"] = result.get("completion_token", 0)
|
||||||
data_dict["total_token"] = result.get("total_token", 0)
|
data_dict["total_token"] = result.get("total_token", 0)
|
||||||
"""
|
"""
|
||||||
|
data_list = self.remove_duplicate_data(data_list)
|
||||||
data_list = self.post_adjust_management_fee_costs(data_list)
|
data_list = self.post_adjust_management_fee_costs(data_list)
|
||||||
data_list = self.supplement_minimum_initial_investment(data_list)
|
data_list = self.supplement_minimum_initial_investment(data_list)
|
||||||
|
|
||||||
return data_list
|
return data_list
|
||||||
|
|
||||||
|
def remove_duplicate_data(self, data_list: list) -> list:
    """
    Remove data items that are repeated on later pages of the same document.

    Reason:
    1. Some pdf documents report the same data on multiple pages.
    2. Usually the first occurrence is the latest data and the later
       occurrences are older data.
    3. So only the first occurrence of each record is kept.

    Two items are duplicates when they resolve to the same raw name
    (see ``get_raw_name``) and carry exactly the same set of data point
    keys (every key except "fund_name" and "share_name").

    Items whose only data points are "management_fee" and/or
    "management_fee_and_costs" never take part in the de-duplication, and
    items that live on the same page are never compared with each other.

    Args:
        data_list: Page-level dicts. Items are read from
            ``data_dict["extract_data"]["data"]``; a missing or null
            "extract_data" / "data" entry is skipped.

    Returns:
        The same ``data_list`` object. The per-page "data" lists are pruned
        in place.
    """
    name_keys = ("fund_name", "share_name")
    fee_only_keys = ("management_fee", "management_fee_and_costs")
    # Maps a record signature (raw name + sorted data point keys) to the
    # page dict that reported it first.
    first_page_by_signature = {}
    for data_dict in data_list:
        # "extract_data" / "data" can be absent or null; treat both as empty.
        extract_data = data_dict.get("extract_data") or {}
        data = extract_data.get("data") if isinstance(extract_data, dict) else None
        if not isinstance(data, list) or not data:
            continue

        kept_items = []
        for data_item in data:
            if not isinstance(data_item, dict):
                # Nothing to compare on; leave unexpected entries untouched.
                kept_items.append(data_item)
                continue
            dp_keys = sorted(key for key in data_item if key not in name_keys)
            if all(dp_key in fee_only_keys for dp_key in dp_keys):
                # Fee-only (or empty) items are always kept.
                kept_items.append(data_item)
                continue
            raw_name = self.get_raw_name(
                data_item.get("fund_name") or "",
                data_item.get("share_name") or "",
            )
            signature = (raw_name, tuple(dp_keys))
            first_page = first_page_by_signature.setdefault(signature, data_dict)
            # Identity, not equality: two different pages with identical
            # content must still be de-duplicated against each other.
            if first_page is data_dict:
                kept_items.append(data_item)

        if len(kept_items) != len(data):
            # Prune in place so every holder of this list sees the result.
            data[:] = kept_items
    return data_list
|
||||||
|
|
||||||
|
def get_raw_name(self, fund_name: str, share_name: str) -> str:
    """
    Build a single display name from a fund name and a share name.

    Args:
        fund_name: Fund name; ``None`` is treated as an empty string.
        share_name: Share name; ``None`` is treated as an empty string.

    Returns:
        ``share_name`` when it equals or already starts with ``fund_name``,
        ``fund_name`` alone when there is no share name, otherwise
        ``"<fund_name> <share_name>"``.
    """
    # Null names would otherwise raise on str.startswith.
    fund_name = fund_name or ""
    share_name = share_name or ""
    if share_name.startswith(fund_name):
        # Also covers the equal-names case and an empty fund name.
        return share_name
    if not share_name:
        # Avoid a dangling trailing space when there is no share name.
        return fund_name
    return f"{fund_name} {share_name}"
|
||||||
|
|
||||||
def post_adjust_management_fee_costs(self, data_list: list):
|
def post_adjust_management_fee_costs(self, data_list: list):
|
||||||
management_fee_costs_list = []
|
management_fee_costs_list = []
|
||||||
management_fee_list = []
|
management_fee_list = []
|
||||||
|
|
|
||||||
|
|
@ -120,6 +120,7 @@
|
||||||
},
|
},
|
||||||
"special_rule": {
|
"special_rule": {
|
||||||
"management_fee_and_costs": [
|
"management_fee_and_costs": [
|
||||||
|
"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.",
|
||||||
"If there are multiple Management fee and costs reported names, here is the priority rule:",
|
"If there are multiple Management fee and costs reported names, here is the priority rule:",
|
||||||
"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
|
||||||
|
|
||||||
|
|
@ -130,11 +131,19 @@
|
||||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
|
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
|
||||||
"\n",
|
"\n",
|
||||||
"A.2 The data value with gross and net, please ignore gross value, output the net value only.",
|
"A.2 The data value with gross and net, please ignore gross value, output the net value only.",
|
||||||
"---Example 2 Start---",
|
"---Example Start---",
|
||||||
"Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n",
|
"Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n",
|
||||||
"---Example 2 End---",
|
"---Example End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}",
|
"{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}",
|
||||||
|
"\n",
|
||||||
|
"If the context only mentions the gross value or gross asset value or GAV, please ignore the gross value as the management_fee_and_costs and management_fee value, just output empty.",
|
||||||
|
"---Example Start---",
|
||||||
|
"Fees and other costs (cont’d) \n\n08 \n\nType of fee or cost Amount 2 How and when paid \nMANAGEMENT COSTS \nThe fees and costs for \nmanaging your investment 1 \nEstimated at 0.75625% of gross \nasset value (GAV) per annum, \ncomprising: \nThe base management fee is payable from \nthe income and assets of the Fund to the \nInvestment Manager half-yearly in arrears \nBase Management Fee \n0.50% per annum of the Average \nGAV 3 \nAnd \nExpense Recovery Costs \n0.25625% (estimated) per annum \nof GAV in other fees, expenses \nand indirect costs.",
|
||||||
|
"---Example End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": []}",
|
||||||
|
"\n",
|
||||||
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
|
||||||
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
|
||||||
"---Example Start---",
|
"---Example Start---",
|
||||||
|
|
@ -154,6 +163,7 @@
|
||||||
"---Example 2 End---",
|
"---Example 2 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
||||||
|
"\n",
|
||||||
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
|
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
|
||||||
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
|
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
|
||||||
"---Example 1 Start---",
|
"---Example 1 Start---",
|
||||||
|
|
|
||||||
73
main.py
73
main.py
|
|
@ -1042,8 +1042,8 @@ def batch_run_documents(
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
)
|
)
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
|
|
@ -1377,7 +1377,7 @@ def merge_output_data_aus_prospectus(
|
||||||
|
|
||||||
def get_aus_prospectus_document_category():
|
def get_aus_prospectus_document_category():
|
||||||
document_sample_file = (
|
document_sample_file = (
|
||||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
r"./sample_documents/aus_prospectus_29_documents_sample.txt"
|
||||||
)
|
)
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||||
|
|
@ -1422,30 +1422,73 @@ def get_aus_prospectus_document_category():
|
||||||
r"/data/aus_prospectus/output/document_category/"
|
r"/data/aus_prospectus/output/document_category/"
|
||||||
)
|
)
|
||||||
os.makedirs(output_extract_document_category_folder, exist_ok=True)
|
os.makedirs(output_extract_document_category_folder, exist_ok=True)
|
||||||
output_file = os.path.join(output_extract_document_category_folder, "document_category.json")
|
output_file = os.path.join(output_extract_document_category_folder, "29_documents_category.json")
|
||||||
with open(output_file, "w", encoding="utf-8") as f:
|
with open(output_file, "w", encoding="utf-8") as f:
|
||||||
json.dump(document_category_dict, f, ensure_ascii=False, indent=4)
|
json.dump(document_category_dict, f, ensure_ascii=False, indent=4)
|
||||||
logger.info(f"Document category: {document_category_dict}")
|
logger.info(f"Document category: {document_category_dict}")
|
||||||
|
|
||||||
|
|
||||||
|
def test_remove_duplicate_extract_data(doc_id: str = "369105359"):
    """
    Manually exercise ``DataExtraction.remove_duplicate_data`` on one document.

    Loads the previously extracted per-page JSON for ``doc_id``, runs the
    duplicate removal on it and logs how many data items were present
    before and after, so the effect of the de-duplication is visible.

    Args:
        doc_id: Id of the document whose extracted JSON is loaded. Defaults
            to the document this routine was originally written against.
    """
    pdf_folder: str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = (
        r"/data/aus_prospectus/output/extract_data/docs/"
    )
    output_mapping_child_folder: str = (
        r"/data/aus_prospectus/output/mapping_data/docs/"
    )
    drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
    doc_source = "aus_prospectus"
    extract_way = "text"
    emea_ar_parsing = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=pdf_folder,
        output_pdf_text_folder=output_pdf_text_folder,
        output_extract_data_folder=output_extract_data_child_folder,
        output_mapping_data_folder=output_mapping_child_folder,
        extract_way=extract_way,
        drilldown_folder=drilldown_folder,
        compare_with_provider=False
    )
    data_extraction = DataExtraction(doc_source=emea_ar_parsing.doc_source,
                                     doc_id=emea_ar_parsing.doc_id,
                                     pdf_file=emea_ar_parsing.pdf_file,
                                     output_data_folder=emea_ar_parsing.output_extract_data_folder,
                                     page_text_dict=emea_ar_parsing.page_text_dict,
                                     datapoint_page_info=emea_ar_parsing.datapoint_page_info,
                                     datapoints=emea_ar_parsing.datapoints,
                                     document_mapping_info_df=emea_ar_parsing.document_mapping_info_df,
                                     extract_way=extract_way)
    data_folder = r"/data/aus_prospectus/output/extract_data/docs/by_text/json/"

    data_file_path = os.path.join(data_folder, f"{doc_id}.json")
    with open(data_file_path, "r", encoding="utf-8") as f:
        data_list = json.load(f)

    def count_items(pages: list) -> int:
        # Total number of extracted data items across all pages.
        return sum(
            len((page.get("extract_data") or {}).get("data") or [])
            for page in pages
        )

    item_count_before = count_items(data_list)
    data_list = data_extraction.remove_duplicate_data(data_list)
    # Report the outcome; previously the result was computed and discarded.
    logger.info(
        f"Remove duplicate data for document {doc_id}: "
        f"{item_count_before} items before, {count_items(data_list)} items after"
    )
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
# test_remove_duplicate_extract_data()
|
||||||
# get_aus_prospectus_document_category()
|
# get_aus_prospectus_document_category()
|
||||||
# test_data_extraction_metrics()
|
# test_data_extraction_metrics()
|
||||||
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_17_documents_by_text_20250219123515.xlsx"
|
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_1_documents_by_text_20250226155259.xlsx"
|
||||||
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
# document_mapping_file_path = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||||
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
# merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
|
||||||
# os.makedirs(merged_total_data_folder, exist_ok=True)
|
# os.makedirs(merged_total_data_folder, exist_ok=True)
|
||||||
# data_file_base_name = os.path.basename(data_file_path)
|
# data_file_base_name = os.path.basename(data_file_path)
|
||||||
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
|
# output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
|
||||||
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
|
# merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
|
||||||
|
|
||||||
# doc_source = "aus_prospectus"
|
doc_source = "aus_prospectus"
|
||||||
# sample_document_list_folder: str = r'./sample_documents/'
|
sample_document_list_folder: str = r'./sample_documents/'
|
||||||
# document_list_file: str = "aus_prospectus_52_documents_sample.txt"
|
document_list_file: str = "aus_prospectus_29_documents_sample.txt"
|
||||||
# pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
# output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
# output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
|
output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
|
||||||
# output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
|
output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
|
||||||
# batch_initial_document(sample_document_list_folder=sample_document_list_folder,
|
# batch_initial_document(sample_document_list_folder=sample_document_list_folder,
|
||||||
# document_list_file=document_list_file,
|
# document_list_file=document_list_file,
|
||||||
# doc_source=doc_source,
|
# doc_source=doc_source,
|
||||||
|
|
@ -1453,6 +1496,8 @@ if __name__ == "__main__":
|
||||||
# output_pdf_text_folder=output_pdf_text_folder,
|
# output_pdf_text_folder=output_pdf_text_folder,
|
||||||
# output_extract_data_child_folder=output_extract_data_child_folder,
|
# output_extract_data_child_folder=output_extract_data_child_folder,
|
||||||
# output_mapping_child_folder=output_mapping_child_folder)
|
# output_mapping_child_folder=output_mapping_child_folder)
|
||||||
|
|
||||||
|
# get_aus_prospectus_document_category()
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
|
|
@ -1486,7 +1531,7 @@ if __name__ == "__main__":
|
||||||
# "555377021",
|
# "555377021",
|
||||||
# "555654388",
|
# "555654388",
|
||||||
# ]
|
# ]
|
||||||
# special_doc_id_list: list = ["411062815", "462770987", "420339794", "441280757"]
|
special_doc_id_list: list = ["412778803"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,29 @@
|
||||||
|
530101994
|
||||||
|
550769189
|
||||||
|
550522985
|
||||||
|
539266893
|
||||||
|
539241700
|
||||||
|
539261734
|
||||||
|
550533961
|
||||||
|
506913190
|
||||||
|
539266814
|
||||||
|
521606716
|
||||||
|
553449663
|
||||||
|
528208796
|
||||||
|
539266817
|
||||||
|
521606755
|
||||||
|
557526129
|
||||||
|
540028470
|
||||||
|
531373053
|
||||||
|
544886057
|
||||||
|
557362556
|
||||||
|
557362553
|
||||||
|
520663234
|
||||||
|
527969661
|
||||||
|
541356150
|
||||||
|
555377021
|
||||||
|
523516443
|
||||||
|
539266874
|
||||||
|
539266880
|
||||||
|
526200514
|
||||||
|
526200513
|
||||||
Loading…
Reference in New Issue