1. Handle the scenario where the document type is not 1, 4, or 5.

2. Support the scenario where the
"investment fees and costs including performance" statement appears on the performance fee data page instead of on the management fee and costs data page.
This commit is contained in:
Blade He 2025-04-03 17:06:43 -05:00
parent 4b896f4460
commit f333cc30f5
5 changed files with 128 additions and 52 deletions

View File

@ -1,9 +1,42 @@
{ {
"management_fee_including_performance_fee": { "management_fee_including_performance_fee": {
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"], "details": [
"effective_datapoints": ["management_fee_and_costs"], {"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"exclude_datapoints": ["performance_fee_costs"], "effective_datapoints": ["management_fee_and_costs"],
"provider_ids": ["0C00005549"], "exclude_datapoints": ["performance_fee_costs"]},
"provider_names": ["Vision Super Pty Ltd"] {"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
"effective_datapoints": ["performance_fee_costs"],
"exclude_datapoints": ["management_fee_and_costs"]}
],
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
"0C000069VJ", "0C0000AL58", "0C00006B9E",
"0C00006BDB", "0C00006BDD", "0C00006BDG",
"0C000035YC", "0C0000CSKN", "0C00005549",
"0C000051C6", "0C00008JA0", "0C000093Z4",
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
"0C00006GNW", "0C00006GPU", "0C00006H46",
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
"0C00006AZB", "0C00006BN6", "0C00006BXE",
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
"0C0000CAQR"],
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
} }
} }

View File

@ -99,7 +99,7 @@ class DataExtraction:
len(list(self.special_datapoint_feature_config.keys())) == 0: len(list(self.special_datapoint_feature_config.keys())) == 0:
return special_datapoint_feature return special_datapoint_feature
for feature in list(self.special_datapoint_feature_config.keys()): for feature in list(self.special_datapoint_feature_config.keys()):
special_datapoint_feature[feature] = {"page_index": []} special_datapoint_feature[feature] = {}
return special_datapoint_feature return special_datapoint_feature
def get_document_category_production(self): def get_document_category_production(self):
@ -153,7 +153,6 @@ class DataExtraction:
pass pass
return fund_name return fund_name
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict: def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
""" """
If document source is aus_propectus and document category is MIS If document source is aus_propectus and document category is MIS
@ -558,6 +557,8 @@ class DataExtraction:
""" """
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value. If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
""" """
if len(self.fund_name_list) < 3:
return data_list, []
raw_name_dict = self.get_raw_name_dict(data_list) raw_name_dict = self.get_raw_name_dict(data_list)
raw_name_list = list(raw_name_dict.keys()) raw_name_list = list(raw_name_dict.keys())
if len(raw_name_list) < 3: if len(raw_name_list) < 3:
@ -729,10 +730,34 @@ class DataExtraction:
def post_management_fee_exclude_performance_fee(self, data_list: list): def post_management_fee_exclude_performance_fee(self, data_list: list):
adjust = False adjust = False
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\ mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("page_index", []) get("page_index", [])
if len(mangement_fee_index_list) == 0: if len(mangement_fee_index_list) == 0:
return data_list, adjust return data_list, adjust
effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("datapoint", "")
if effective_datapoint == "performance_fee_costs":
mangement_fee_index_list = []
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_dict in data_list:
page_index = data_dict.get("page_index", -1)
data = data_dict.get("extract_data", {}).get("data", [])
for data_item in data:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "management_fee_and_costs"]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "performance_fee_costs"]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if exist_effective_datapoints and not exist_exclude_datapoints:
if page_index not in mangement_fee_index_list:
mangement_fee_index_list.append(page_index)
min_page_index = min(mangement_fee_index_list) min_page_index = min(mangement_fee_index_list)
performance_fee_item_list = [] performance_fee_item_list = []
for data_dict in data_list: for data_dict in data_list:
@ -759,7 +784,7 @@ class DataExtraction:
keys = list(management_fee_data.keys()) keys = list(management_fee_data.keys())
fund_name = management_fee_data.get("fund_name", "") fund_name = management_fee_data.get("fund_name", "")
share_name = management_fee_data.get("share_name", "") share_name = management_fee_data.get("share_name", "")
if len(fund_name) == 0 or len(share_name) == 0: if fund_name == "" or share_name == "":
continue continue
if "management_fee_and_costs" in keys: if "management_fee_and_costs" in keys:
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1) management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
@ -771,7 +796,9 @@ class DataExtraction:
for performance_fee_item in performance_fee_item_list: for performance_fee_item in performance_fee_item_list:
pf_fund_name = performance_fee_item.get("fund_name", "") pf_fund_name = performance_fee_item.get("fund_name", "")
pf_share_name = performance_fee_item.get("share_name", "") pf_share_name = performance_fee_item.get("share_name", "")
if pf_fund_name == fund_name and pf_share_name == share_name: if pf_fund_name == "" or pf_share_name == "":
continue
if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1) performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
try: try:
performance_fee_costs = float(performance_fee_costs) performance_fee_costs = float(performance_fee_costs)
@ -943,7 +970,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num not in [37, 38]: # if page_num not in [42]:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -1692,16 +1719,15 @@ class DataExtraction:
new_data_list.append(new_data) new_data_list.append(new_data)
extract_data_info["data"] = new_data_list extract_data_info["data"] = new_data_list
if page_text is not None and len(page_text) > 0: if page_text is not None and len(page_text) > 0:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num) try:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
except Exception as e:
logger.error(f"Error in setting datapoint feature properties: {e}")
return extract_data_info return extract_data_info
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None: def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
for feature, properties in self.special_datapoint_feature_config.items(): for feature, properties in self.special_datapoint_feature_config.items():
regex_text_list = properties.get("regex_text", []) if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
if len(regex_text_list) == 0:
continue
effective_datapoints = properties.get("effective_datapoints", [])
if len(effective_datapoints) == 0:
continue continue
provider_ids = properties.get("provider_ids", []) provider_ids = properties.get("provider_ids", [])
if len(provider_ids) > 0: if len(provider_ids) > 0:
@ -1714,36 +1740,51 @@ class DataExtraction:
break break
if not is_current_provider: if not is_current_provider:
continue continue
exclude_datapoints = properties.get("exclude_datapoints", []) detail_list = properties.get("details", [])
if len(detail_list) == 0:
exist_effective_datapoints = False continue
exist_exclude_datapoints = False set_feature_property = False
for data_item in data_list: for detail in detail_list:
datapoints = [datapoint for datapoint in list(data_item.keys()) regex_text_list = detail.get("regex_text", [])
if datapoint in effective_datapoints] if len(regex_text_list) == 0:
if len(datapoints) > 0: continue
exist_effective_datapoints = True effective_datapoints = detail.get("effective_datapoints", [])
datapoints = [datapoint for datapoint in list(data_item.keys()) if len(effective_datapoints) == 0:
if datapoint in exclude_datapoints] continue
if len(datapoints) > 0: exclude_datapoints = detail.get("exclude_datapoints", [])
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if not exist_effective_datapoints: exist_effective_datapoints = False
continue exist_exclude_datapoints = False
if exist_exclude_datapoints: for data_item in data_list:
continue datapoints = [datapoint for datapoint in list(data_item.keys())
found_regex_text = False if datapoint in effective_datapoints]
for regex_text in regex_text_list: if len(datapoints) > 0:
regex_search = re.search(regex_text, page_text, re.IGNORECASE) exist_effective_datapoints = True
if regex_search is not None: datapoints = [datapoint for datapoint in list(data_item.keys())
found_regex_text = True if datapoint in exclude_datapoints]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if not exist_effective_datapoints:
continue
if exist_exclude_datapoints:
continue
found_regex_text = False
for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None:
found_regex_text = True
break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
self.special_datapoint_feature[feature]["page_index"].append(page_num)
set_feature_property = True
if set_feature_property:
break break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["page_index"].append(page_num)
def split_multi_share_name(self, raw_share_name: str) -> list: def split_multi_share_name(self, raw_share_name: str) -> list:
""" """

View File

@ -148,9 +148,9 @@ class FilterPages:
} }
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0] effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
document_type = self.document_mapping_info_df["DocumentType"].iloc[0] document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
if document_type in [4, 5]: if document_type in [4, 5] or self.doc_source == "emea_ar":
document_type = "ar" document_type = "ar"
elif document_type == 1: elif document_type == 1 or self.doc_source == "aus_prospectus":
document_type = "prospectus" document_type = "prospectus"
language_id = self.document_mapping_info_df["Language"].iloc[0] language_id = self.document_mapping_info_df["Language"].iloc[0]
language = self.language_config.get(language_id, None) language = self.language_config.get(language_id, None)

View File

@ -71,6 +71,7 @@
"If with multiple data values in same row, please extract the latest.", "If with multiple data values in same row, please extract the latest.",
"\n", "\n",
"4. Reported names:", "4. Reported names:",
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
"Only output the values which with significant reported names.", "Only output the values which with significant reported names.",
"Multiple data columns with same reported name but different post-fix:", "Multiple data columns with same reported name but different post-fix:",
"If there are multiple reported names with different post-fix text, here is the priority rule:", "If there are multiple reported names with different post-fix text, here is the priority rule:",
@ -122,7 +123,7 @@
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00", "total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.", "management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.", "management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.", "performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.", "buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.", "sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.", "establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",

11
main.py
View File

@ -1531,17 +1531,18 @@ if __name__ == "__main__":
# document_sample_file = ( # document_sample_file = (
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
# ) # )
document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
)
# document_sample_file = ( # document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt" # r"./sample_documents/aus_prospectus_46_documents_sample.txt"
# ) # )
document_sample_file = (
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
)
logger.info(f"Start to run document sample file: {document_sample_file}") logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0] if len(doc_id.strip()) > 0]
# special_doc_id_list = ["527969661"] # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
# special_doc_id_list = ["539999907", "455235248", "448576924"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (