Support separate tables and pages data with specific business rules

This commit is contained in:
Blade He 2025-03-31 17:08:49 -05:00
parent 355b145cf7
commit 984c686bf3
5 changed files with 293 additions and 73 deletions

View File

@ -0,0 +1,9 @@
{
"management_fee_including_performance_fee": {
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"],
"provider_ids": ["0C00005549"],
"provider_names": ["Vision Super Pty Ltd"]
}
}

View File

@ -75,11 +75,33 @@ class DataExtraction:
self.datapoint_type_config = self.get_datapoint_type()
self.datapoint_name_config = self.get_datapoint_name()
self.replace_table_header_config = self.get_replace_table_header_config()
self.special_datapoint_feature_config = self.get_special_datapoint_feature_config()
self.special_datapoint_feature = self.init_special_datapoint_feature()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name()
self.extract_way = extract_way
self.output_image_folder = output_image_folder
def get_special_datapoint_feature_config(self) -> dict:
    """Load the special-datapoint-feature configuration.

    Reads ``special_datapoint_feature.json`` from ``self.configuration_folder``.

    Returns:
        The parsed JSON mapping of feature name to its properties, or an
        empty dict when the config file does not exist.
    """
    special_datapoint_feature_config_file = os.path.join(
        self.configuration_folder, "special_datapoint_feature.json")
    if not os.path.exists(special_datapoint_feature_config_file):
        return {}
    # The original pre-initialized an empty dict that json.load immediately
    # overwrote; json.load either succeeds or raises, so return it directly.
    with open(special_datapoint_feature_config_file, "r", encoding="utf-8") as f:
        return json.load(f)
def init_special_datapoint_feature(self) -> dict:
    """Build the initial runtime state for every configured special feature.

    Each feature from ``self.special_datapoint_feature_config`` starts with
    an empty ``page_index`` list, later filled as matching pages are found.

    Returns:
        Mapping of feature name to ``{"page_index": []}``; empty dict when
        no configuration is present.
    """
    # `not config` covers both None and an empty dict, replacing the
    # original `len(list(config.keys())) == 0` check.
    if not self.special_datapoint_feature_config:
        return {}
    return {feature: {"page_index": []}
            for feature in self.special_datapoint_feature_config}
def get_document_category_production(self):
document_category = None
document_production = None
@ -293,7 +315,9 @@ class DataExtraction:
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
data_list = self.post_adjust_management_fee_costs(data_list)
data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
if not adjust:
data_list = self.post_adjust_management_fee_costs(data_list)
data_list = self.check_administration_fees(data_list)
return data_list
@ -703,6 +727,64 @@ class DataExtraction:
raw_name_dict[raw_name] = {"fund_name": fund_name, "share_name": share_name}
return raw_name_dict
def post_management_fee_exclude_performance_fee(self, data_list: list):
    """Subtract performance fees from management fees on flagged pages.

    Business rule: on pages flagged with the
    ``management_fee_including_performance_fee`` feature, the reported
    ``management_fee_and_costs`` still includes the performance fee, so the
    matching ``performance_fee_costs`` (found on a later page for the same
    fund/share pair) is subtracted out.

    Args:
        data_list: Per-page extraction results; each entry carries a
            ``page_index`` and ``extract_data["data"]`` row dicts.

    Returns:
        Tuple ``(data_list, adjust)`` where ``adjust`` is True when at least
        one management fee value was corrected.
    """
    adjust = False
    # Fixed typo: was `mangement_fee_index_list`.
    management_fee_index_list = self.special_datapoint_feature.get(
        "management_fee_including_performance_fee", {}).get("page_index", [])
    if not management_fee_index_list:
        return data_list, adjust
    min_page_index = min(management_fee_index_list)

    def _to_float(value):
        # Extracted values may be arbitrary strings; -1 marks "not numeric".
        # Narrowed from a bare `except:` — float() raises only these two.
        try:
            return float(value)
        except (TypeError, ValueError):
            return -1

    # Collect performance-fee rows appearing after the first flagged page.
    performance_fee_item_list = []
    for data_dict in data_list:
        if data_dict.get("page_index", -1) <= min_page_index:
            continue
        for data_item in data_dict.get("extract_data", {}).get("data", []):
            if not data_item.get("share_name", ""):
                continue
            if "performance_fee_costs" in data_item:
                performance_fee_item_list.append(data_item)

    # Adjust management-fee rows on the flagged pages in place.
    for data_dict in data_list:
        if data_dict.get("page_index", -1) not in management_fee_index_list:
            continue
        for management_fee_data in data_dict.get("extract_data", {}).get("data", []):
            fund_name = management_fee_data.get("fund_name", "")
            share_name = management_fee_data.get("share_name", "")
            if not fund_name or not share_name:
                continue
            if "management_fee_and_costs" not in management_fee_data:
                continue
            management_fee_and_costs = _to_float(
                management_fee_data.get("management_fee_and_costs", -1))
            if management_fee_and_costs == -1:
                continue
            for performance_fee_item in performance_fee_item_list:
                if performance_fee_item.get("fund_name", "") != fund_name or \
                        performance_fee_item.get("share_name", "") != share_name:
                    continue
                performance_fee_costs = _to_float(
                    performance_fee_item.get("performance_fee_costs", -1))
                if performance_fee_costs != -1:
                    management_fee_data["management_fee_and_costs"] = \
                        management_fee_and_costs - performance_fee_costs
                    management_fee_data["management_fee"] = \
                        management_fee_data["management_fee_and_costs"]
                    management_fee_data["source"] = \
                        f"subtract_performance_fee_{performance_fee_costs}"
                    adjust = True
                # Stop at the first fund/share match, as the original did.
                break
    return data_list, adjust
def post_adjust_management_fee_costs(self, data_list: list):
"""
Adjust the management fee and management fee and costs
@ -861,7 +943,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [13, 14]:
# if page_num not in [37, 38]:
# continue
if page_num in handled_page_num_list:
continue
@ -1249,9 +1331,17 @@ class DataExtraction:
except:
data = {"data": []}
try:
data = self.validate_data(extract_data_info=data,
page_text=page_text,
previous_page_last_fund=previous_page_last_fund)
if self.doc_source == "emea_ar":
data = self.validate_emea_ar_data(extract_data_info=data,
page_text=page_text,
previous_page_last_fund=previous_page_last_fund)
elif self.doc_source == "aus_prospectus":
data = self.validate_aus_prospectus_data(extract_data_info=data,
page_text=page_text,
page_num=page_num,
previous_page_last_fund=previous_page_last_fund)
else:
pass
except:
pass
@ -1366,7 +1456,12 @@ class DataExtraction:
except:
data = {"data": []}
try:
data = self.validate_data(data, None, previous_page_last_fund)
if self.doc_source == "emea_ar":
data = self.validate_emea_ar_data(data, None, previous_page_last_fund)
elif self.doc_source == "aus_prospectus":
data = self.validate_aus_prospectus_data(data, None, page_num, previous_page_last_fund)
else:
pass
except:
pass
@ -1405,7 +1500,7 @@ class DataExtraction:
# print(text)
return text
def validate_data(self,
def validate_emea_ar_data(self,
extract_data_info: dict,
page_text: str,
previous_page_last_fund: str=None) -> dict:
@ -1417,6 +1512,7 @@ class DataExtraction:
data_list = extract_data_info.get("data", [])
if len(data_list) == 0:
return extract_data_info
remove_list = []
performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
@ -1436,29 +1532,29 @@ class DataExtraction:
if len(keys) == 0:
remove_list.append(data)
continue
fund_name = data.get("fund name", "").strip()
if fund_name == "":
raw_fund_name = data.get("fund name", "").strip()
if raw_fund_name == "":
remove_list.append(data)
continue
# Clean fund name start
if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
previous_page_last_fund = previous_page_last_fund.strip()
if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
if raw_fund_name.startswith(previous_page_last_fund) and raw_fund_name != previous_page_last_fund:
modified_fund_name = raw_fund_name.replace(previous_page_last_fund, "").strip()
if len(modified_fund_name.split()) > 1:
fund_name = modified_fund_name
fund_name = self.get_fund_name(fund_name, "Fund")
fund_name = self.get_fund_name(fund_name, "Bond")
raw_fund_name = modified_fund_name
raw_fund_name = self.get_fund_name(raw_fund_name, "Fund")
raw_fund_name = self.get_fund_name(raw_fund_name, "Bond")
remove_prefix_list = ["Market Specific Equity Sub-Funds",
"International and Regional Equity Sub-Funds",
"Equity Sub-Funds"]
for remove_item in remove_prefix_list:
if fund_name.startswith(remove_item):
fund_name = fund_name.replace(remove_item, "").strip()
if raw_fund_name.startswith(remove_item):
raw_fund_name = raw_fund_name.replace(remove_item, "").strip()
data["fund name"] = fund_name
data["fund name"] = raw_fund_name
# Clean fund name end
keys = list(data.keys())
@ -1472,11 +1568,11 @@ class DataExtraction:
if ter_search is not None:
include_key_words = True
if not include_key_words:
is_share_name = self.check_fund_name_as_share(fund_name)
is_share_name = self.check_fund_name_as_share(raw_fund_name)
if not is_share_name:
remove_list.append(data)
break
data["share name"] = fund_name
data["share name"] = raw_fund_name
if data.get(key, "") == "":
data.pop(key)
for remove_data in remove_list:
@ -1508,8 +1604,8 @@ class DataExtraction:
multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
exist_multi_over_3_share = False
for data in data_list:
fund_name = data.get("fund name", "").strip()
if len(fund_name) == 0:
raw_fund_name = data.get("fund name", "").strip()
if len(raw_fund_name) == 0:
continue
raw_share_name = data.get("share name", "")
if not exist_multi_over_3_share:
@ -1523,7 +1619,7 @@ class DataExtraction:
if len(share_name_list) > 0:
for share_name in share_name_list:
new_data = {}
new_data["fund_name"] = fund_name
new_data["fund_name"] = raw_fund_name
if share_name != "":
new_data["share_name"] = share_name
ter = data.get("ter", None)
@ -1537,10 +1633,118 @@ class DataExtraction:
if key not in ["fund name", "share name", "ter", "performance fees"]:
new_data[key] = value
new_data_list.append(new_data)
extract_data_info["data"] = new_data_list
return extract_data_info
def validate_aus_prospectus_data(self,
                                 extract_data_info: dict,
                                 page_text: str,
                                 page_num: int,
                                 previous_page_last_fund: str=None) -> dict:
    """Normalize AUS-prospectus rows extracted from a single page.

    Drops rows without a fund name, trims a fund name that repeats the last
    fund of the previous page, expands comma-separated multi-share rows into
    one row per share, and finally records special datapoint features found
    on the page text.
    """
    rows = extract_data_info.get("data", [])
    if not rows:
        return extract_data_info

    # Pass 1: clean fund names, keeping only rows that have one.
    kept_rows = []
    for row in rows:
        fund = row.get("fund name", "").strip()
        if not fund:
            continue
        if previous_page_last_fund:
            previous_page_last_fund = previous_page_last_fund.strip()
            if fund != previous_page_last_fund and fund.startswith(previous_page_last_fund):
                trimmed = fund.replace(previous_page_last_fund, "").strip()
                # Accept the trimmed name only when it still looks like a
                # real multi-word fund name.
                if len(trimmed.split()) > 1:
                    fund = trimmed
        row["fund name"] = fund
        kept_rows.append(row)

    # Pass 2: expand multi-share rows and rename keys to snake_case.
    multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
    seen_multi_share = False
    normalized_rows = []
    for row in kept_rows:
        fund = row.get("fund name", "").strip()
        if not fund:
            continue
        share_text = row.get("share name", "")
        # Once one row shows a comma-separated share list, treat every
        # later row's share field as potentially multi-valued too.
        if not seen_multi_share and re.search(multi_over_3_share_regex, share_text) is not None:
            seen_multi_share = True
        shares = self.split_multi_share_name(share_text) if seen_multi_share else [share_text]
        for share in shares:
            normalized = {"fund_name": fund}
            if share != "":
                normalized["share_name"] = share
            for key, value in row.items():
                if key not in ("fund name", "share name"):
                    normalized[key] = value
            normalized_rows.append(normalized)

    extract_data_info["data"] = normalized_rows
    if page_text:
        self.set_datapoint_feature_properties(normalized_rows, page_text, page_num)
    return extract_data_info
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
    """Record pages whose extracted data and text match a configured feature.

    For every feature in ``self.special_datapoint_feature_config``, appends
    ``page_num`` to ``self.special_datapoint_feature[feature]["page_index"]``
    when all of these hold:
      * the rows in ``data_list`` contain at least one effective datapoint,
      * no row contains an excluded datapoint,
      * the document's provider is among the configured providers (if any),
      * one of the feature's regexes matches ``page_text``.
    """
    for feature, properties in self.special_datapoint_feature_config.items():
        regex_text_list = properties.get("regex_text", [])
        effective_datapoints = properties.get("effective_datapoints", [])
        # A feature without regexes or effective datapoints is inert.
        if not regex_text_list or not effective_datapoints:
            continue
        # Provider restriction: when configured, the document must belong
        # to one of the listed providers.
        provider_ids = properties.get("provider_ids", [])
        if provider_ids:
            doc_provider_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
            if not set(provider_ids) & set(doc_provider_list):
                continue
        exclude_datapoints = properties.get("exclude_datapoints", [])
        exist_effective = any(
            any(key in effective_datapoints for key in data_item)
            for data_item in data_list)
        exist_exclude = any(
            any(key in exclude_datapoints for key in data_item)
            for data_item in data_list)
        if not exist_effective or exist_exclude:
            continue
        if not any(re.search(regex_text, page_text) for regex_text in regex_text_list):
            continue
        # setdefault guards against a feature missing from the runtime state
        # (the original indexed directly and could raise KeyError).
        feature_state = self.special_datapoint_feature.setdefault(feature, {})
        if feature_state.get("page_index") is None:
            feature_state["page_index"] = []
        feature_state["page_index"].append(page_num)
def split_multi_share_name(self, raw_share_name: str) -> list:
"""
Some document, e.g. 481482392

View File

@ -448,7 +448,7 @@
"A. Exclude reported name",
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ",
"Estimated transaction costs offset by buy/sell spreads (% pa), ",
"Estimated transaction costs offset by buy/sell spreads (% pa), Transaction costs",
"---Example 1 Start---",
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
"---Example 1 End---",
@ -466,6 +466,13 @@
"---Example 3 End---",
"The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:",
"{\"data\": []}",
"\n",
"---Example 4 Start---",
"Transaction costs \nOption % of options assets* \nHigh Growth 0.03% \nTaken into account in the daily calculation\nof unit prices\nMember activity related fees and costs \nBuy-sell spread Nil N/A\nSwitching fee Nil N/A\n",
"---Example 4 End---",
"According to example, please exclude Transaction costs.",
"\"Buy-sell spread\" data section is under \"Member activity related fees and costs\", the value is Nil, output for buy_spread and sell_spread should be:",
"{\"data\": []}",
"B. Simple case with simple table structure:",
"---Example 1 Start---",
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",

View File

@ -1538,7 +1538,7 @@ if __name__ == "__main__":
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0]
# special_doc_id_list = ["448576924"]
special_doc_id_list = ["573372424", "455235248", "462780211"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (

File diff suppressed because one or more lines are too long