1. Handle the scenario where the document type is not 1, 4, or 5.

2. Support the scenario where the "investment fees and costs including performance fees" statement appears on the performance fee data page instead of the management fee and costs data page.
This commit is contained in:
Blade He 2025-04-03 17:06:43 -05:00
parent 4b896f4460
commit f333cc30f5
5 changed files with 128 additions and 52 deletions

View File

@ -1,9 +1,42 @@
{
"management_fee_including_performance_fee": {
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"],
"provider_ids": ["0C00005549"],
"provider_names": ["Vision Super Pty Ltd"]
"details": [
{"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"]},
{"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
"effective_datapoints": ["performance_fee_costs"],
"exclude_datapoints": ["management_fee_and_costs"]}
],
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
"0C000069VJ", "0C0000AL58", "0C00006B9E",
"0C00006BDB", "0C00006BDD", "0C00006BDG",
"0C000035YC", "0C0000CSKN", "0C00005549",
"0C000051C6", "0C00008JA0", "0C000093Z4",
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
"0C00006GNW", "0C00006GPU", "0C00006H46",
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
"0C00006AZB", "0C00006BN6", "0C00006BXE",
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
"0C0000CAQR"],
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
}
}
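For reference, here is a minimal sketch of how one entry in the new "details" list is intended to be evaluated against a page, mirroring the logic of set_datapoint_feature_properties further down; the helper name match_feature_detail is hypothetical and not part of the repository.

import re

def match_feature_detail(detail: dict, page_text: str, data_list: list) -> str:
    # A detail entry applies only if at least one effective datapoint was extracted
    # on the page and none of the exclude datapoints were extracted.
    effective_datapoints = detail.get("effective_datapoints", [])
    exclude_datapoints = detail.get("exclude_datapoints", [])
    has_effective = any(dp in item for item in data_list for dp in effective_datapoints)
    has_exclude = any(dp in item for item in data_list for dp in exclude_datapoints)
    if not has_effective or has_exclude:
        return ""
    # The page text must also contain the configured statement.
    for regex_text in detail.get("regex_text", []):
        if re.search(regex_text, page_text, re.IGNORECASE):
            return effective_datapoints[0]  # recorded as the feature's "datapoint"
    return ""

In the actual pipeline, the first matching detail records the page index and the effective datapoint on special_datapoint_feature, and post_management_fee_exclude_performance_fee later uses that "datapoint" value to decide which fee to adjust.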

View File

@ -99,7 +99,7 @@ class DataExtraction:
len(list(self.special_datapoint_feature_config.keys())) == 0:
return special_datapoint_feature
for feature in list(self.special_datapoint_feature_config.keys()):
special_datapoint_feature[feature] = {"page_index": []}
special_datapoint_feature[feature] = {}
return special_datapoint_feature
def get_document_category_production(self):
@ -153,7 +153,6 @@ class DataExtraction:
pass
return fund_name
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
"""
If document source is aus_prospectus and document category is MIS
@ -558,6 +557,8 @@ class DataExtraction:
"""
If a datapoint carries the production name, then each fund/share class in the same document should have the same value for that datapoint.
"""
if len(self.fund_name_list) < 3:
return data_list, []
raw_name_dict = self.get_raw_name_dict(data_list)
raw_name_list = list(raw_name_dict.keys())
if len(raw_name_list) < 3:
@ -729,10 +730,34 @@ class DataExtraction:
def post_management_fee_exclude_performance_fee(self, data_list: list):
adjust = False
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("page_index", [])
if len(mangement_fee_index_list) == 0:
return data_list, adjust
effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("datapoint", "")
if effective_datapoint == "performance_fee_costs":
mangement_fee_index_list = []
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_dict in data_list:
page_index = data_dict.get("page_index", -1)
data = data_dict.get("extract_data", {}).get("data", [])
for data_item in data:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "management_fee_and_costs"]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "performance_fee_costs"]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if exist_effective_datapoints and not exist_exclude_datapoints:
if page_index not in mangement_fee_index_list:
mangement_fee_index_list.append(page_index)
min_page_index = min(mangement_fee_index_list)
performance_fee_item_list = []
for data_dict in data_list:
@ -759,7 +784,7 @@ class DataExtraction:
keys = list(management_fee_data.keys())
fund_name = management_fee_data.get("fund_name", "")
share_name = management_fee_data.get("share_name", "")
if len(fund_name) == 0 or len(share_name) == 0:
if fund_name == "" or share_name == "":
continue
if "management_fee_and_costs" in keys:
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
@ -771,7 +796,9 @@ class DataExtraction:
for performance_fee_item in performance_fee_item_list:
pf_fund_name = performance_fee_item.get("fund_name", "")
pf_share_name = performance_fee_item.get("share_name", "")
if pf_fund_name == fund_name and pf_share_name == share_name:
if pf_fund_name == "" or pf_share_name == "":
continue
if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
try:
performance_fee_costs = float(performance_fee_costs)
@ -943,7 +970,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [37, 38]:
# if page_num not in [42]:
# continue
if page_num in handled_page_num_list:
continue
@ -1692,16 +1719,15 @@ class DataExtraction:
new_data_list.append(new_data)
extract_data_info["data"] = new_data_list
if page_text is not None and len(page_text) > 0:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
try:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
except Exception as e:
logger.error(f"Error in setting datapoint feature properties: {e}")
return extract_data_info
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
for feature, properties in self.special_datapoint_feature_config.items():
regex_text_list = properties.get("regex_text", [])
if len(regex_text_list) == 0:
continue
effective_datapoints = properties.get("effective_datapoints", [])
if len(effective_datapoints) == 0:
if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
continue
provider_ids = properties.get("provider_ids", [])
if len(provider_ids) > 0:
@ -1714,36 +1740,51 @@ class DataExtraction:
break
if not is_current_provider:
continue
exclude_datapoints = properties.get("exclude_datapoints", [])
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_item in data_list:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in effective_datapoints]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in exclude_datapoints]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
detail_list = properties.get("details", [])
if len(detail_list) == 0:
continue
set_feature_property = False
for detail in detail_list:
regex_text_list = detail.get("regex_text", [])
if len(regex_text_list) == 0:
continue
effective_datapoints = detail.get("effective_datapoints", [])
if len(effective_datapoints) == 0:
continue
exclude_datapoints = detail.get("exclude_datapoints", [])
if not exist_effective_datapoints:
continue
if exist_exclude_datapoints:
continue
found_regex_text = False
for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None:
found_regex_text = True
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_item in data_list:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in effective_datapoints]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in exclude_datapoints]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if not exist_effective_datapoints:
continue
if exist_exclude_datapoints:
continue
found_regex_text = False
for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None:
found_regex_text = True
break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
self.special_datapoint_feature[feature]["page_index"].append(page_num)
set_feature_property = True
if set_feature_property:
break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["page_index"].append(page_num)
def split_multi_share_name(self, raw_share_name: str) -> list:
"""

View File

@ -148,9 +148,9 @@ class FilterPages:
}
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
if document_type in [4, 5]:
if document_type in [4, 5] or self.doc_source == "emea_ar":
document_type = "ar"
elif document_type == 1:
elif document_type == 1 or self.doc_source == "aus_prospectus":
document_type = "prospectus"
language_id = self.document_mapping_info_df["Language"].iloc[0]
language = self.language_config.get(language_id, None)
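This pairs with commit item 1: when the numeric document type is neither 1 nor 4/5, the document source now decides the category. A simplified sketch of the resulting mapping (the pass-through branch for unmapped types is an assumption, not shown in the diff):

def resolve_document_type(document_type, doc_source: str):
    if document_type in [4, 5] or doc_source == "emea_ar":
        return "ar"
    if document_type == 1 or doc_source == "aus_prospectus":
        return "prospectus"
    return document_type  # assumption: types outside 1/4/5 pass through unchanged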

View File

@ -71,6 +71,7 @@
"If with multiple data values in same row, please extract the latest.",
"\n",
"4. Reported names:",
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
"Only output the values which with significant reported names.",
"Multiple data columns with same reported name but different post-fix:",
"If there are multiple reported names with different post-fix text, here is the priority rule:",
@ -122,7 +123,7 @@
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",

main.py
View File

@ -1531,17 +1531,18 @@ if __name__ == "__main__":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
)
# document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
# r"./sample_documents/aus_prospectus_46_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
)
logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0]
# special_doc_id_list = ["527969661"]
# special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
# special_doc_id_list = ["539999907", "455235248", "448576924"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (