Apply configuration file to replace disordered table header contents
This commit is contained in:
parent
2548606ccc
commit
e9f6383258
|
|
@ -1308,6 +1308,10 @@ def get_gt_pred_by_compare_values(gt_value, pred_value, gt_list, pred_list, data
|
||||||
def is_equal(gt_value, pred_value, data_point: str = ""):
|
def is_equal(gt_value, pred_value, data_point: str = ""):
|
||||||
if gt_value is not None and len(str(gt_value)) > 0 and \
|
if gt_value is not None and len(str(gt_value)) > 0 and \
|
||||||
pred_value is not None and len(str(pred_value)) > 0:
|
pred_value is not None and len(str(pred_value)) > 0:
|
||||||
|
if gt_value == "0.0":
|
||||||
|
gt_value = "0"
|
||||||
|
if pred_value == "0.0":
|
||||||
|
pred_value = "0"
|
||||||
if gt_value == pred_value:
|
if gt_value == pred_value:
|
||||||
return True
|
return True
|
||||||
if data_point == "benchmark_name":
|
if data_point == "benchmark_name":
|
||||||
|
|
@ -1351,7 +1355,7 @@ if __name__ == "__main__":
|
||||||
verify_document_list_file_list = [None,
|
verify_document_list_file_list = [None,
|
||||||
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
"./sample_documents/aus_prospectus_29_documents_sample.txt",
|
||||||
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
"./sample_documents/aus_prospectus_17_documents_sample.txt"]
|
||||||
is_for_all = False
|
is_for_all = True
|
||||||
for verify_document_list_file in verify_document_list_file_list:
|
for verify_document_list_file in verify_document_list_file_list:
|
||||||
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
calculate_metrics_based_db_data_file(audit_file_path=audit_file_path,
|
||||||
audit_data_sheet=audit_data_sheet,
|
audit_data_sheet=audit_data_sheet,
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
{
|
||||||
|
"details": [
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n",
|
||||||
|
"\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n",
|
||||||
|
"\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
|
||||||
|
"comments": ["item 0: document 410899007",
|
||||||
|
"item 1: document 539266880, 539266817, 539261734",
|
||||||
|
"item 2: document 539266893"]
|
||||||
|
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
|
||||||
|
"comments": ["item 0: document 410899007"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
|
||||||
|
"Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
|
||||||
|
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n",
|
||||||
|
"comments": ["item 0: document 401212184, page 17",
|
||||||
|
"item 1: document 401212184, page 18 - 20"]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"regex_all_list":
|
||||||
|
["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
|
||||||
|
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
|
||||||
|
"comments": ["item 0: document 411062815, page 17"]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"details": []
|
||||||
|
}
|
||||||
|
|
@ -74,6 +74,7 @@ class DataExtraction:
|
||||||
self.datapoint_level_config = self.get_datapoint_level()
|
self.datapoint_level_config = self.get_datapoint_level()
|
||||||
self.datapoint_type_config = self.get_datapoint_type()
|
self.datapoint_type_config = self.get_datapoint_type()
|
||||||
self.datapoint_name_config = self.get_datapoint_name()
|
self.datapoint_name_config = self.get_datapoint_name()
|
||||||
|
self.replace_table_header_config = self.get_replace_table_header_config()
|
||||||
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
|
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
|
||||||
self.get_datapoint_reported_name()
|
self.get_datapoint_reported_name()
|
||||||
self.extract_way = extract_way
|
self.extract_way = extract_way
|
||||||
|
|
@ -206,6 +207,15 @@ class DataExtraction:
|
||||||
datapoint_name = json.load(f)
|
datapoint_name = json.load(f)
|
||||||
return datapoint_name
|
return datapoint_name
|
||||||
|
|
||||||
|
def get_replace_table_header_config(self) -> list:
    """Load the table-header replacement rules from the configuration folder.

    Reads ``replace_table_header.json`` from ``self.configuration_folder``
    and returns the value stored under its top-level ``"details"`` key.

    Returns:
        The ``"details"`` value from the JSON file, or an empty list when
        the file does not exist or has no ``"details"`` key.
    """
    replace_table_header_file = os.path.join(
        self.configuration_folder, "replace_table_header.json"
    )
    # A missing configuration file is not an error: it simply means no
    # header replacement rules are defined for this document source.
    if not os.path.exists(replace_table_header_file):
        return []
    with open(replace_table_header_file, "r", encoding="utf-8") as f:
        return json.load(f).get("details", [])
|
||||||
|
|
||||||
def get_pdf_page_text_dict(self) -> dict:
|
def get_pdf_page_text_dict(self) -> dict:
|
||||||
pdf_util = PDFUtil(self.pdf_file)
|
pdf_util = PDFUtil(self.pdf_file)
|
||||||
success, text, page_text_dict = pdf_util.extract_text()
|
success, text, page_text_dict = pdf_util.extract_text()
|
||||||
|
|
@ -598,7 +608,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num != 25:
|
# if page_num != 16:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
@ -616,7 +626,7 @@ class DataExtraction:
|
||||||
else:
|
else:
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
|
|
||||||
page_text = replace_special_table_header(page_text)
|
page_text = replace_special_table_header(self.replace_table_header_config, page_text)
|
||||||
extract_data = self.extract_data_by_page(
|
extract_data = self.extract_data_by_page(
|
||||||
page_num,
|
page_num,
|
||||||
page_text,
|
page_text,
|
||||||
|
|
@ -681,7 +691,8 @@ class DataExtraction:
|
||||||
)
|
)
|
||||||
if not with_same_structure_table:
|
if not with_same_structure_table:
|
||||||
break
|
break
|
||||||
next_page_text = replace_special_table_header(next_page_text)
|
next_page_text = replace_special_table_header(self.replace_table_header_config,
|
||||||
|
next_page_text)
|
||||||
target_text = current_text + next_page_text
|
target_text = current_text + next_page_text
|
||||||
else:
|
else:
|
||||||
target_text = ""
|
target_text = ""
|
||||||
|
|
|
||||||
6
main.py
6
main.py
|
|
@ -1526,8 +1526,8 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# special_doc_id_list = ["553242411"]
|
# special_doc_id_list = ["553242411"]
|
||||||
|
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
doc_source = "aus_prospectus"
|
doc_source = "aus_prospectus"
|
||||||
# doc_source = "emea_ar"
|
# doc_source = "emea_ar"
|
||||||
|
|
@ -1560,7 +1560,7 @@ if __name__ == "__main__":
|
||||||
# "544886057",
|
# "544886057",
|
||||||
# "550769189",
|
# "550769189",
|
||||||
# "553449663"]
|
# "553449663"]
|
||||||
# special_doc_id_list = ["506913190"]
|
# special_doc_id_list = ["411062815"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
|
|
@ -1036,7 +1036,7 @@ def remove_abundant_data_detail(data_detail_list: list,
|
||||||
return data_detail_list
|
return data_detail_list
|
||||||
|
|
||||||
|
|
||||||
def replace_special_table_header(page_text: str):
|
def replace_special_table_header(replace_table_header_config: list, page_text: str):
|
||||||
"""
|
"""
|
||||||
For some special table header, replace to the standard header
|
For some special table header, replace to the standard header
|
||||||
e.g.
|
e.g.
|
||||||
|
|
@ -1083,42 +1083,15 @@ def replace_special_table_header(page_text: str):
|
||||||
Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
|
Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
|
||||||
Performance \nfees charged \nby interposed \nvehicles \n
|
Performance \nfees charged \nby interposed \nvehicles \n
|
||||||
"""
|
"""
|
||||||
replace_info_list = [
|
if replace_table_header_config is None or len(replace_table_header_config) == 0:
|
||||||
{
|
return page_text
|
||||||
# item 0: document 410899007
|
|
||||||
# item 1: document 539266880, 539266817, 539261734
|
|
||||||
# item 2: document 539266893
|
|
||||||
"regex_all_list":
|
|
||||||
[r"\nIndirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n",
|
|
||||||
r"\n(Investment\s*Option|Fund)[\s\S]*?Management\s*fee[\s\S]*?Indirect\s*costs[\s\S]*?performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?indirect\s*costs[\s\S]*?(interposed\s*vehicles|managers\s*vehicles)\s*\n",
|
|
||||||
r"\nOption\s*name\s*Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Management\s*fee[\s\S]*?Buy\/sell\s*spreads[\s\S]*?Recoverable\s*expenses[\s\S]*?interposed\s*vehicles\s*\n"],
|
|
||||||
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
# item 0: document 410899007
|
|
||||||
"regex_all_list":
|
|
||||||
[r"Indirect costs[\s\S]*?Estimated\s*performance\s*fees[\s\S]*?Investment\s*Option\s*Management\s*fee[\s\S]*?Transactions\s*costs[\s\S]*?Buy\/sell\s*spreads\s*\(\%\)\s*\n"],
|
|
||||||
"replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
# item 0: document 401212184, page 17 - 20
|
|
||||||
"regex_all_list":
|
|
||||||
[r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Entry\s*Fee[\s\S]*?Nil\s*Entry[\s\S]*?Other\s*investment\s*costs[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Nil\s*Entry\s*Fee\s*.*\n",
|
|
||||||
r"Management\s*Fees\s*and\s*costs\s*[\s\S]*?Ongoing\s*Fee.*?\(A\)[\s\S]*?\(D\)\s*Total\s*Fees\s*and\s*Costs\s*Investment\s*fund\s*Estimated\s*Other[\s\S]*?Entry\s*Fee\s*Nil\s*Entry[\s\S]*?Nil\s*Entry[\s\S]*?Performance\s*fees[\s\S]*?Transaction\s*costs[\s\S]*?Fee\s*option.*\n"],
|
|
||||||
"replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nEstimated Transaction costs \nOther option 1 \nOther option 2 \n"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
# item 0: document 411062815, page 17
|
|
||||||
"regex_all_list":
|
|
||||||
[r"Investment\s*option\s*Administration fees[\s\S]*?administration\s*costs\s*Investment\s*fees[\s\S]*?investment\s*costs\s*Administration\s*fees[\s\S]*?Investment\s*fees[\s\S]*?Estimated\s*administration[\s\S]*?transaction\s*costs[\s\S]*?annual\s*fees\s*and\s*costs\s*\(\%\s*pa\)\s*\n"],
|
|
||||||
"replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
updated_text = False
|
updated_text = False
|
||||||
for replace_info in replace_info_list:
|
for replace_info in replace_table_header_config:
|
||||||
for regex_all in replace_info["regex_all_list"]:
|
for regex_all in replace_info.get("regex_all_list", []):
|
||||||
if re.search(regex_all, page_text) is not None:
|
table_header_search = re.search(regex_all, page_text)
|
||||||
page_text = re.sub(regex_all, replace_info["replace_text"], page_text)
|
if table_header_search is not None:
|
||||||
|
original_text = table_header_search.group()
|
||||||
|
page_text = re.sub(regex_all, replace_info.get("replace_text", original_text), page_text)
|
||||||
updated_text = True
|
updated_text = True
|
||||||
break
|
break
|
||||||
if updated_text:
|
if updated_text:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue