Apply a configuration file to replace disordered table header contents

This commit is contained in:
Blade He 2025-03-10 11:09:00 -05:00
parent 2548606ccc
commit e9f6383258
6 changed files with 69 additions and 43 deletions

View File

@ -1308,6 +1308,10 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
def is_equal(gt_value, pred_value, data_point: str = ""):
if gt_value is not None and len(str(gt_value)) > 0 and \
pred_value is not None and len(str(pred_value)) > 0:
if gt_value == "0.0":
gt_value = "0"
if pred_value == "0.0":
pred_value = "0"
if gt_value == pred_value:
return True
if data_point == "benchmark_name":
@ -1351,7 +1355,7 @@ if __name__ == "__main__":
verify_document_list_file_list = [None,
"./sample_documents/aus_prospectus_29_documents_sample.txt",
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
is_for_all = False
is_for_all = True
for verify_document_list_file in verify_document_list_file_list:
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
audit_data_sheet=audit_data_sheet,

View File

@ -0,0 +1,35 @@
{
"details": [
{
"regex_all_list":
["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n",
"\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n",
"\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
"comments": ["item 0: document 410899007",
"item 1: document 539266880, 539266817, 539261734",
"item 2: document 539266893"]
},
{
"regex_all_list":
["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
"comments": ["item 0: document 410899007"]
},
{
"regex_all_list":
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
"comments": ["item 0: document 401212184, page 17",
"item 1: document 401212184, page 18 - 20"]
},
{
"regex_all_list":
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
"comments": ["item 0: document 411062815, page 17"]
}
]
}

View File

@ -0,0 +1,3 @@
{
"details": []
}

View File

@ -74,6 +74,7 @@ class DataExtraction:
self.datapoint_level_config = self.get_datapoint_level()
self.datapoint_type_config = self.get_datapoint_type()
self.datapoint_name_config = self.get_datapoint_name()
self.replace_table_header_config = self.get_replace_table_header_config()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name()
self.extract_way = extract_way
@ -206,6 +207,15 @@ class DataExtraction:
datapoint_name = json.load(f)
return datapoint_name
def get_replace_table_header_config(self) -> list:
    """
    Load the table-header replacement rules from the configuration folder.

    Reads "replace_table_header.json" located under self.configuration_folder
    and returns the value stored under its top-level "details" key.

    Returns:
        list: the "details" entries, or an empty list when the file does not
        exist, has no "details" key, or "details" is null/empty.
    """
    replace_table_header_file = os.path.join(self.configuration_folder, "replace_table_header.json")
    # A missing configuration file simply means there is nothing to replace.
    if not os.path.exists(replace_table_header_file):
        return []
    with open(replace_table_header_file, "r", encoding="utf-8") as f:
        replace_table_header_config = json.load(f).get("details", [])
    # An explicit `"details": null` would otherwise hand None to callers
    # that expect a list; normalise it to an empty list.
    return replace_table_header_config or []
def get_pdf_page_text_dict(self) -> dict:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
@ -598,7 +608,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num != 25:
# if page_num != 16:
# continue
if page_num in handled_page_num_list:
continue
@ -616,7 +626,7 @@ class DataExtraction:
else:
previous_page_fund_name = None
page_text = replace_special_table_header(page_text)
page_text = replace_special_table_header(self.replace_table_header_config, page_text)
extract_data = self.extract_data_by_page(
page_num,
page_text,
@ -681,7 +691,8 @@ class DataExtraction:
)
if not with_same_structure_table:
break
next_page_text = replace_special_table_header(next_page_text)
next_page_text = replace_special_table_header(self.replace_table_header_config,
next_page_text)
target_text = current_text + next_page_text
else:
target_text = ""

View File

@ -1526,8 +1526,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
doc_source = "aus_prospectus"
# doc_source = "emea_ar"
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
# "544886057",
# "550769189",
# "553449663"]
# special_doc_id_list = ["506913190"]
# special_doc_id_list = ["411062815"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

View File

@ -1036,7 +1036,7 @@ def remove_abundant_data_detail(data_detail_list: list,
return data_detail_list
def replace_special_table_header(page_text: str):
def replace_special_table_header(replace_table_header_config: list, page_text: str):
"""
For some special table header, replace to the standard header
e.g.
@ -1083,42 +1083,15 @@ def replace_special_table_header(page_text: str):
Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
Performance \nfees charged \nby interposed \nvehicles \n
"""
replace_info_list = [
{
# item 0: document 410899007
# item 1: document 539266880, 539266817, 539261734
# item 2: document 539266893
"regex_all_list":
[r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n",
r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
},
{
# item 0: document 410899007
"regex_all_list":
[r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"],
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
},
{
# item 0: document 401212184, page 17 - 20
"regex_all_list":
[r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n",
r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"],
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n"
},
{
# item 0: document 411062815, page 17
"regex_all_list":
[r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"],
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n"
}
]
if replace_table_header_config is None or len(replace_table_header_config) == 0:
return page_text
updated_text = False
for replace_info in replace_info_list:
for regex_all in replace_info["regex_all_list"]:
if re.search(regex_all, page_text) is not None:
page_text = re.sub(regex_all, replace_info["replace_text"], page_text)
for replace_info in replace_table_header_config:
for regex_all in replace_info.get("regex_all_list", []):
table_header_search = re.search(regex_all, page_text)
if table_header_search is not None:
original_text = table_header_search.group()
page_text = re.sub(regex_all, replace_info.get("replace_text", original_text), page_text)
updated_text = True
break
if updated_text: