support separating table and page data according to specific business rules
parent 355b145cf7
commit 984c686bf3
special_datapoint_feature.json
@@ -0,0 +1,9 @@
+{
+    "management_fee_including_performance_fee": {
+        "regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
+        "effective_datapoints": ["management_fee_and_costs"],
+        "exclude_datapoints": ["performance_fee_costs"],
+        "provider_ids": ["0C00005549"],
+        "provider_names": ["Vision Super Pty Ltd"]
+    }
+}
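This new config drives the special-datapoint handling added below: a rule fires on pages whose text matches one of the `regex_text` patterns, that carry at least one of the `effective_datapoints`, do not carry any `exclude_datapoints`, and belong to one of the listed providers. A minimal standalone sketch of those semantics (the helper name and flat arguments are illustrative, not part of the commit):

import re

def rule_matches(rule: dict, page_text: str, page_datapoints: list, provider_id: str) -> bool:
    # Provider scope: when provider_ids is non-empty, the document's provider must be listed.
    if rule.get("provider_ids") and provider_id not in rule["provider_ids"]:
        return False
    # The page must report at least one effective datapoint...
    if not any(dp in page_datapoints for dp in rule.get("effective_datapoints", [])):
        return False
    # ...and must not already report an excluded datapoint separately.
    if any(dp in page_datapoints for dp in rule.get("exclude_datapoints", [])):
        return False
    # Finally, the page text must match one of the trigger patterns.
    return any(re.search(p, page_text) for p in rule.get("regex_text", []))

rule = {
    "regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
    "effective_datapoints": ["management_fee_and_costs"],
    "exclude_datapoints": ["performance_fee_costs"],
    "provider_ids": ["0C00005549"],
}
print(rule_matches(rule, "investment fees and costs, including performance fees",
                   ["management_fee_and_costs"], "0C00005549"))  # True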
@@ -75,11 +75,33 @@ class DataExtraction:
         self.datapoint_type_config = self.get_datapoint_type()
         self.datapoint_name_config = self.get_datapoint_name()
         self.replace_table_header_config = self.get_replace_table_header_config()
+        self.special_datapoint_feature_config = self.get_special_datapoint_feature_config()
+        self.special_datapoint_feature = self.init_special_datapoint_feature()
+
         self.datapoint_reported_name_config, self.non_english_reported_name_config = \
             self.get_datapoint_reported_name()
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder

+    def get_special_datapoint_feature_config(self) -> dict:
+        special_datapoint_feature_config_file = os.path.join(self.configuration_folder, "special_datapoint_feature.json")
+        if not os.path.exists(special_datapoint_feature_config_file):
+            return {}
+        special_datapoint_feature_config = {}
+        with open(special_datapoint_feature_config_file, "r", encoding="utf-8") as f:
+            special_datapoint_feature_config = json.load(f)
+
+        return special_datapoint_feature_config
+
+    def init_special_datapoint_feature(self) -> dict:
+        special_datapoint_feature = {}
+        if self.special_datapoint_feature_config is None or \
+                len(list(self.special_datapoint_feature_config.keys())) == 0:
+            return special_datapoint_feature
+        for feature in list(self.special_datapoint_feature_config.keys()):
+            special_datapoint_feature[feature] = {"page_index": []}
+        return special_datapoint_feature
+
     def get_document_category_production(self):
         document_category = None
         document_production = None
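The two helpers degrade gracefully: a missing config file yields an empty dict, which in turn yields no features. Otherwise each configured feature gets an empty page-index bucket. A sketch of the resulting state, assuming the JSON file above is present:

import json

with open("special_datapoint_feature.json", "r", encoding="utf-8") as f:
    special_datapoint_feature_config = json.load(f)   # the parsed JSON above

special_datapoint_feature = {
    feature: {"page_index": []}                        # seeded empty; filled later by
    for feature in special_datapoint_feature_config    # set_datapoint_feature_properties()
}
print(special_datapoint_feature)
# {'management_fee_including_performance_fee': {'page_index': []}}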
@@ -293,6 +315,8 @@ class DataExtraction:
         data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
         data_list = self.remove_duplicate_data(data_list)
+        if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
+            data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
+            if not adjust:
+                data_list = self.post_adjust_management_fee_costs(data_list)

         data_list = self.check_administration_fees(data_list)
@@ -703,6 +727,64 @@ class DataExtraction:
             raw_name_dict[raw_name] = {"fund_name": fund_name, "share_name": share_name}
         return raw_name_dict

+    def post_management_fee_exclude_performance_fee(self, data_list: list):
+        adjust = False
+        management_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
+            get("page_index", [])
+        if len(management_fee_index_list) == 0:
+            return data_list, adjust
+        min_page_index = min(management_fee_index_list)
+        performance_fee_item_list = []
+        for data_dict in data_list:
+            page_index = data_dict.get("page_index", -1)
+            if page_index <= min_page_index:
+                continue
+            extract_data = data_dict.get("extract_data", {})
+            data = extract_data.get("data", [])
+            for data_item in data:
+                keys = list(data_item.keys())
+                share_name = data_item.get("share_name", "")
+                if len(share_name) == 0:
+                    continue
+                if "performance_fee_costs" in keys:
+                    performance_fee_item_list.append(data_item)
+
+        for data_dict in data_list:
+            page_index = data_dict.get("page_index", -1)
+            if page_index not in management_fee_index_list:
+                continue
+            extract_data = data_dict.get("extract_data", {})
+            management_fee_data_list = extract_data.get("data", [])
+            for management_fee_data in management_fee_data_list:
+                keys = list(management_fee_data.keys())
+                fund_name = management_fee_data.get("fund_name", "")
+                share_name = management_fee_data.get("share_name", "")
+                if len(fund_name) == 0 or len(share_name) == 0:
+                    continue
+                if "management_fee_and_costs" in keys:
+                    management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
+                    try:
+                        management_fee_and_costs = float(management_fee_and_costs)
+                    except:
+                        management_fee_and_costs = -1
+                    if management_fee_and_costs != -1:
+                        for performance_fee_item in performance_fee_item_list:
+                            pf_fund_name = performance_fee_item.get("fund_name", "")
+                            pf_share_name = performance_fee_item.get("share_name", "")
+                            if pf_fund_name == fund_name and pf_share_name == share_name:
+                                performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
+                                try:
+                                    performance_fee_costs = float(performance_fee_costs)
+                                except:
+                                    performance_fee_costs = -1
+                                if performance_fee_costs != -1:
+                                    management_fee_data["management_fee_and_costs"] = management_fee_and_costs - performance_fee_costs
+                                    management_fee_data["management_fee"] = management_fee_data["management_fee_and_costs"]
+                                    management_fee_data["source"] = f"subtract_performance_fee_{performance_fee_costs}"
+                                    adjust = True
+                                break
+        return data_list, adjust
+
     def post_adjust_management_fee_costs(self, data_list: list):
         """
         Adjust the management fee and management fee and costs
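A worked example of the adjustment, with hypothetical rows shaped like the structures the method walks: a page flagged by the feature reports a combined fee, a later page reports the performance fee for the same fund/share pair, and the combined figure is reduced by it.

# Hypothetical input; page_index 5 is assumed to be recorded in
# special_datapoint_feature["management_fee_including_performance_fee"]["page_index"].
data_list = [
    {"page_index": 5, "extract_data": {"data": [
        {"fund_name": "Balanced Growth", "share_name": "MySuper",
         "management_fee_and_costs": "0.75"}]}},
    {"page_index": 9, "extract_data": {"data": [
        {"fund_name": "Balanced Growth", "share_name": "MySuper",
         "performance_fee_costs": "0.25"}]}},
]
# post_management_fee_exclude_performance_fee(data_list) would rewrite the first row to
#   management_fee_and_costs == 0.5, management_fee == 0.5,
#   source == "subtract_performance_fee_0.25"
# and return adjust == True.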
@@ -861,7 +943,7 @@ class DataExtraction:
         previous_page_datapoints = []
         previous_page_fund_name = None
         for page_num, page_text in self.page_text_dict.items():
-            # if page_num not in [13, 14]:
+            # if page_num not in [37, 38]:
             #     continue
             if page_num in handled_page_num_list:
                 continue
@@ -1249,9 +1331,17 @@ class DataExtraction:
             except:
                 data = {"data": []}
             try:
-                data = self.validate_data(extract_data_info=data,
-                                          page_text=page_text,
-                                          previous_page_last_fund=previous_page_last_fund)
+                if self.doc_source == "emea_ar":
+                    data = self.validate_emea_ar_data(extract_data_info=data,
+                                                      page_text=page_text,
+                                                      previous_page_last_fund=previous_page_last_fund)
+                elif self.doc_source == "aus_prospectus":
+                    data = self.validate_aus_prospectus_data(extract_data_info=data,
+                                                             page_text=page_text,
+                                                             page_num=page_num,
+                                                             previous_page_last_fund=previous_page_last_fund)
+                else:
+                    pass
             except:
                 pass
@@ -1366,7 +1456,12 @@ class DataExtraction:
             except:
                 data = {"data": []}
             try:
-                data = self.validate_data(data, None, previous_page_last_fund)
+                if self.doc_source == "emea_ar":
+                    data = self.validate_emea_ar_data(data, None, previous_page_last_fund)
+                elif self.doc_source == "aus_prospectus":
+                    data = self.validate_aus_prospectus_data(data, None, page_num, previous_page_last_fund)
+                else:
+                    pass
             except:
                 pass
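Both call sites now route validation on `doc_source`; only the aus_prospectus variant receives `page_num`, which it needs to record matched pages into `special_datapoint_feature`. A compact restatement of the dispatch contract (illustrative method, not in the commit):

def _dispatch_validation(self, data, page_text, page_num, previous_page_last_fund):
    # emea_ar keeps the original validator, renamed validate_emea_ar_data below;
    # aus_prospectus threads page_num through so matched pages can be recorded.
    if self.doc_source == "emea_ar":
        return self.validate_emea_ar_data(data, page_text, previous_page_last_fund)
    if self.doc_source == "aus_prospectus":
        return self.validate_aus_prospectus_data(data, page_text, page_num, previous_page_last_fund)
    return data  # other doc sources pass through unchanged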
@@ -1405,7 +1500,7 @@ class DataExtraction:
         # print(text)
         return text

-    def validate_data(self,
+    def validate_emea_ar_data(self,
                               extract_data_info: dict,
                               page_text: str,
                               previous_page_last_fund: str=None) -> dict:
@@ -1417,6 +1512,7 @@ class DataExtraction:
         data_list = extract_data_info.get("data", [])
         if len(data_list) == 0:
             return extract_data_info
+
         remove_list = []
         performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
         nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
@@ -1436,29 +1532,29 @@ class DataExtraction:
             if len(keys) == 0:
                 remove_list.append(data)
                 continue
-            fund_name = data.get("fund name", "").strip()
-            if fund_name == "":
+            raw_fund_name = data.get("fund name", "").strip()
+            if raw_fund_name == "":
                 remove_list.append(data)
                 continue

             # Clean fund name start
             if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
                 previous_page_last_fund = previous_page_last_fund.strip()
-                if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
-                    modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
+                if raw_fund_name.startswith(previous_page_last_fund) and raw_fund_name != previous_page_last_fund:
+                    modified_fund_name = raw_fund_name.replace(previous_page_last_fund, "").strip()
                     if len(modified_fund_name.split()) > 1:
-                        fund_name = modified_fund_name
-            fund_name = self.get_fund_name(fund_name, "Fund")
-            fund_name = self.get_fund_name(fund_name, "Bond")
+                        raw_fund_name = modified_fund_name
+            raw_fund_name = self.get_fund_name(raw_fund_name, "Fund")
+            raw_fund_name = self.get_fund_name(raw_fund_name, "Bond")

             remove_prefix_list = ["Market Specific Equity Sub-Funds",
                                   "International and Regional Equity Sub-Funds",
                                   "Equity Sub-Funds"]
             for remove_item in remove_prefix_list:
-                if fund_name.startswith(remove_item):
-                    fund_name = fund_name.replace(remove_item, "").strip()
+                if raw_fund_name.startswith(remove_item):
+                    raw_fund_name = raw_fund_name.replace(remove_item, "").strip()

-            data["fund name"] = fund_name
+            data["fund name"] = raw_fund_name
             # Clean fund name end

             keys = list(data.keys())
@@ -1472,11 +1568,11 @@ class DataExtraction:
                 if ter_search is not None:
                     include_key_words = True
                 if not include_key_words:
-                    is_share_name = self.check_fund_name_as_share(fund_name)
+                    is_share_name = self.check_fund_name_as_share(raw_fund_name)
                     if not is_share_name:
                         remove_list.append(data)
                         break
-                    data["share name"] = fund_name
+                    data["share name"] = raw_fund_name
                 if data.get(key, "") == "":
                     data.pop(key)
             for remove_data in remove_list:
@@ -1508,8 +1604,8 @@ class DataExtraction:
         multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
         exist_multi_over_3_share = False
         for data in data_list:
-            fund_name = data.get("fund name", "").strip()
-            if len(fund_name) == 0:
+            raw_fund_name = data.get("fund name", "").strip()
+            if len(raw_fund_name) == 0:
                 continue
             raw_share_name = data.get("share name", "")
             if not exist_multi_over_3_share:
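The `multi_over_3_share_regex` heuristic used here (and again in the aus_prospectus validator below) treats a share cell as multi-valued once it sees three or more comma-separated uppercase tokens; `split_multi_share_name` then fans the row out into one record per share class. A quick check of the pattern:

import re

multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
print(re.search(multi_over_3_share_regex, "A, B, C, D share classes") is not None)  # True
print(re.search(multi_over_3_share_regex, "Class A") is not None)                   # False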
@@ -1523,7 +1619,7 @@ class DataExtraction:
             if len(share_name_list) > 0:
                 for share_name in share_name_list:
                     new_data = {}
-                    new_data["fund_name"] = fund_name
+                    new_data["fund_name"] = raw_fund_name
                     if share_name != "":
                         new_data["share_name"] = share_name
                     ter = data.get("ter", None)
@@ -1541,6 +1637,114 @@ class DataExtraction:
         extract_data_info["data"] = new_data_list
         return extract_data_info

+    def validate_aus_prospectus_data(self,
+                                     extract_data_info: dict,
+                                     page_text: str,
+                                     page_num: int,
+                                     previous_page_last_fund: str=None) -> dict:
+        data_list = extract_data_info.get("data", [])
+        if len(data_list) == 0:
+            return extract_data_info
+        remove_list = []
+        for data in data_list:
+            raw_fund_name = data.get("fund name", "").strip()
+            if raw_fund_name == "":
+                remove_list.append(data)
+                continue
+
+            # Clean fund name start
+            if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
+                previous_page_last_fund = previous_page_last_fund.strip()
+                if raw_fund_name.startswith(previous_page_last_fund) and raw_fund_name != previous_page_last_fund:
+                    modified_fund_name = raw_fund_name.replace(previous_page_last_fund, "").strip()
+                    if len(modified_fund_name.split()) > 1:
+                        raw_fund_name = modified_fund_name
+            data["fund name"] = raw_fund_name
+        for remove_data in remove_list:
+            if remove_data in data_list:
+                data_list.remove(remove_data)
+
+        new_data_list = []
+        multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
+        exist_multi_over_3_share = False
+        for data in data_list:
+            raw_fund_name = data.get("fund name", "").strip()
+            if len(raw_fund_name) == 0:
+                continue
+            raw_share_name = data.get("share name", "")
+            if not exist_multi_over_3_share:
+                multi_over_3_share_search = re.search(multi_over_3_share_regex, raw_share_name)
+                if multi_over_3_share_search is not None:
+                    exist_multi_over_3_share = True
+            if exist_multi_over_3_share:
+                share_name_list = self.split_multi_share_name(raw_share_name)
+            else:
+                share_name_list = [raw_share_name]
+            if len(share_name_list) > 0:
+                for share_name in share_name_list:
+                    new_data = {}
+                    new_data["fund_name"] = raw_fund_name
+                    if share_name != "":
+                        new_data["share_name"] = share_name
+                    for key, value in data.items():
+                        if key not in ["fund name", "share name"]:
+                            new_data[key] = value
+                    new_data_list.append(new_data)
+        extract_data_info["data"] = new_data_list
+        if page_text is not None and len(page_text) > 0:
+            self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
+        return extract_data_info
+
+    def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
+        for feature, properties in self.special_datapoint_feature_config.items():
+            regex_text_list = properties.get("regex_text", [])
+            if len(regex_text_list) == 0:
+                continue
+            effective_datapoints = properties.get("effective_datapoints", [])
+            if len(effective_datapoints) == 0:
+                continue
+            provider_ids = properties.get("provider_ids", [])
+            if len(provider_ids) > 0:
+                is_current_provider = False
+                doc_provider_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
+                if len(doc_provider_list) > 0:
+                    for provider in provider_ids:
+                        if provider in doc_provider_list:
+                            is_current_provider = True
+                            break
+                if not is_current_provider:
+                    continue
+            exclude_datapoints = properties.get("exclude_datapoints", [])
+
+            exist_effective_datapoints = False
+            exist_exclude_datapoints = False
+            for data_item in data_list:
+                datapoints = [datapoint for datapoint in list(data_item.keys())
+                              if datapoint in effective_datapoints]
+                if len(datapoints) > 0:
+                    exist_effective_datapoints = True
+                datapoints = [datapoint for datapoint in list(data_item.keys())
+                              if datapoint in exclude_datapoints]
+                if len(datapoints) > 0:
+                    exist_exclude_datapoints = True
+                if exist_effective_datapoints and exist_exclude_datapoints:
+                    break
+
+            if not exist_effective_datapoints:
+                continue
+            if exist_exclude_datapoints:
+                continue
+            found_regex_text = False
+            for regex_text in regex_text_list:
+                regex_search = re.search(regex_text, page_text)
+                if regex_search is not None:
+                    found_regex_text = True
+                    break
+            if found_regex_text:
+                if self.special_datapoint_feature[feature].get("page_index", None) is None:
+                    self.special_datapoint_feature[feature]["page_index"] = []
+                self.special_datapoint_feature[feature]["page_index"].append(page_num)
+
     def split_multi_share_name(self, raw_share_name: str) -> list:
         """
         Some documents, e.g. 481482392
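Putting the pieces together for the Vision Super rule above (hypothetical values): when a page's rows carry management_fee_and_costs but no performance_fee_costs, and the page text matches the configured pattern, the page number is recorded; the post-processing step later subtracts the performance fee on exactly those pages.

# Hypothetical trace for page 5 of a document from a listed provider:
rows = [{"fund_name": "Balanced Growth", "share_name": "MySuper",
         "management_fee_and_costs": "0.75"}]
page_text = "investment fees and costs including performance fees"
# set_datapoint_feature_properties(rows, page_text, 5) would leave
# special_datapoint_feature == {"management_fee_including_performance_fee": {"page_index": [5]}}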
@@ -448,7 +448,7 @@
     "A. Exclude reported name",
     "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
     "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ",
-    "Estimated transaction costs offset by buy/sell spreads (% pa), ",
+    "Estimated transaction costs offset by buy/sell spreads (% pa), Transaction costs",
     "---Example 1 Start---",
     "Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
     "---Example 1 End---",
@@ -466,6 +466,13 @@
     "---Example 3 End---",
     "The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:",
     "{\"data\": []}",
+    "\n",
+    "---Example 4 Start---",
+    "Transaction costs \nOption % of option’s assets* \nHigh Growth 0.03% \nTaken into account in the daily calculation\nof unit prices\nMember activity related fees and costs \nBuy-sell spread Nil N/A\nSwitching fee Nil N/A\n",
+    "---Example 4 End---",
+    "According to example, please exclude Transaction costs.",
+    "\"Buy-sell spread\" data section is under \"Member activity related fees and costs\", the value is Nil, output for buy_spread and sell_spread should be:",
+    "{\"data\": []}",
     "B. Simple case with simple table structure:",
     "---Example 1 Start---",
     "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
main.py
@@ -1538,7 +1538,7 @@ if __name__ == "__main__":
     with open(document_sample_file, "r", encoding="utf-8") as f:
         special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
                                if len(doc_id.strip()) > 0]
-    # special_doc_id_list = ["448576924"]
+    special_doc_id_list = ["573372424", "455235248", "462780211"]
     pdf_folder: str = r"/data/aus_prospectus/pdf/"
     output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
     output_extract_data_child_folder: str = (
File diff suppressed because one or more lines are too long