Apply configuration file to replace disordered table header contents

This commit is contained in:
Blade He 2025-03-10 11:09:00 -05:00
parent 2548606ccc
commit e9f6383258
6 changed files with 69 additions and 43 deletions

View File

@ -1308,6 +1308,10 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
def is_equal(gt_value, pred_value, data_point: str = ""): def is_equal(gt_value, pred_value, data_point: str = ""):
if gt_value is not None and len(str(gt_value)) > 0 and \ if gt_value is not None and len(str(gt_value)) > 0 and \
pred_value is not None and len(str(pred_value)) > 0: pred_value is not None and len(str(pred_value)) > 0:
if gt_value == "0.0":
gt_value = "0"
if pred_value == "0.0":
pred_value = "0"
if gt_value == pred_value: if gt_value == pred_value:
return True return True
if data_point == "benchmark_name": if data_point == "benchmark_name":
@ -1351,7 +1355,7 @@ if __name__ == "__main__":
verify_document_list_file_list = [None, verify_document_list_file_list = [None,
"./sample_documents/aus_prospectus_29_documents_sample.txt", "./sample_documents/aus_prospectus_29_documents_sample.txt",
"./sample_documents/aus_prospectus_17_documents_sample.txt"] "./sample_documents/aus_prospectus_17_documents_sample.txt"]
is_for_all = False is_for_all = True
for verify_document_list_file in verify_document_list_file_list: for verify_document_list_file in verify_document_list_file_list:
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path, calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
audit_data_sheet=audit_data_sheet, audit_data_sheet=audit_data_sheet,

View File

@ -0,0 +1,35 @@
{
"details": [
{
"regex_all_list":
["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n",
"\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n",
"\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
"comments": ["item 0: document 410899007",
"item 1: document 539266880, 539266817, 539261734",
"item 2: document 539266893"]
},
{
"regex_all_list":
["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
"comments": ["item 0: document 410899007"]
},
{
"regex_all_list":
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
"comments": ["item 0: document 401212184, page 17",
"item 1: document 401212184, page 18 - 20"]
},
{
"regex_all_list":
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
"comments": ["item 0: document 411062815, page 17"]
}
]
}

View File

@ -0,0 +1,3 @@
{
"details": []
}

View File

@ -74,6 +74,7 @@ class DataExtraction:
self.datapoint_level_config = self.get_datapoint_level() self.datapoint_level_config = self.get_datapoint_level()
self.datapoint_type_config = self.get_datapoint_type() self.datapoint_type_config = self.get_datapoint_type()
self.datapoint_name_config = self.get_datapoint_name() self.datapoint_name_config = self.get_datapoint_name()
self.replace_table_header_config = self.get_replace_table_header_config()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name() self.get_datapoint_reported_name()
self.extract_way = extract_way self.extract_way = extract_way
@ -206,6 +207,15 @@ class DataExtraction:
datapoint_name = json.load(f) datapoint_name = json.load(f)
return datapoint_name return datapoint_name
def get_replace_table_header_config(self) -> list:
    """
    Load the table-header replacement rules from the configuration folder.

    Reads "replace_table_header.json" located under
    ``self.configuration_folder`` and returns the value stored under its
    top-level "details" key.

    Returns:
        list: the configured "details" entries, or an empty list when the
        file does not exist, has no "details" key, or the value under
        "details" is null/empty.
    """
    replace_table_header_file = os.path.join(self.configuration_folder, "replace_table_header.json")
    # A missing configuration file simply means "no replacement rules".
    if not os.path.exists(replace_table_header_file):
        return []
    with open(replace_table_header_file, "r", encoding="utf-8") as f:
        replace_table_header_config = json.load(f).get("details", [])
    # Normalise a JSON null (or other empty value) to an empty list so the
    # return type is always a list, as the annotation promises.
    return replace_table_header_config or []
def get_pdf_page_text_dict(self) -> dict: def get_pdf_page_text_dict(self) -> dict:
pdf_util = PDFUtil(self.pdf_file) pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text() success, text, page_text_dict = pdf_util.extract_text()
@ -598,7 +608,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num != 25: # if page_num != 16:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -616,7 +626,7 @@ class DataExtraction:
else: else:
previous_page_fund_name = None previous_page_fund_name = None
page_text = replace_special_table_header(page_text) page_text = replace_special_table_header(self.replace_table_header_config, page_text)
extract_data = self.extract_data_by_page( extract_data = self.extract_data_by_page(
page_num, page_num,
page_text, page_text,
@ -681,7 +691,8 @@ class DataExtraction:
) )
if not with_same_structure_table: if not with_same_structure_table:
break break
next_page_text = replace_special_table_header(next_page_text) next_page_text = replace_special_table_header(self.replace_table_header_config,
next_page_text)
target_text = current_text + next_page_text target_text = current_text + next_page_text
else: else:
target_text = "" target_text = ""

View File

@ -1526,8 +1526,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"] # special_doc_id_list = ["553242411"]
re_run_extract_data = False re_run_extract_data = True
re_run_mapping_data = False re_run_mapping_data = True
force_save_total_data = True force_save_total_data = True
doc_source = "aus_prospectus" doc_source = "aus_prospectus"
# doc_source = "emea_ar" # doc_source = "emea_ar"
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
# "544886057", # "544886057",
# "550769189", # "550769189",
# "553449663"] # "553449663"]
# special_doc_id_list = ["506913190"] # special_doc_id_list = ["411062815"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

View File

@ -1036,7 +1036,7 @@ def remove_abundant_data_detail(data_detail_list: list,
return data_detail_list return data_detail_list
def replace_special_table_header(page_text: str): def replace_special_table_header(replace_table_header_config: list, page_text: str):
""" """
For some special table header, replace to the standard header For some special table header, replace to the standard header
e.g. e.g.
@ -1083,42 +1083,15 @@ def replace_special_table_header(page_text: str):
Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
Performance \nfees charged \nby interposed \nvehicles \n Performance \nfees charged \nby interposed \nvehicles \n
""" """
replace_info_list = [ if replace_table_header_config is None or len(replace_table_header_config) == 0:
{ return page_text
# item 0: document 410899007
# item 1: document 539266880, 539266817, 539261734
# item 2: document 539266893
"regex_all_list":
[r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n",
r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
},
{
# item 0: document 410899007
"regex_all_list":
[r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
},
{
# item 0: document 401212184, page 17 - 20
"regex_all_list":
[r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n",
r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n"
},
{
# item 0: document 411062815, page 17
"regex_all_list":
[r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n"
}
]
updated_text = False updated_text = False
for replace_info in replace_info_list: for replace_info in replace_table_header_config:
for regex_all in replace_info["regex_all_list"]: for regex_all in replace_info.get("regex_all_list", []):
if re.search(regex_all, page_text) is not None: table_header_search = re.search(regex_all, page_text)
page_text = re.sub(regex_all, replace_info["replace_text"], page_text) if table_header_search is not None:
original_text = table_header_search.group()
page_text = re.sub(regex_all, replace_info.get("replace_text", original_text), page_text)
updated_text = True updated_text = True
break break
if updated_text: if updated_text: