diff --git a/configuration/aus_prospectus/special_datapoint_feature.json b/configuration/aus_prospectus/special_datapoint_feature.json index 49b7de4..b18477f 100644 --- a/configuration/aus_prospectus/special_datapoint_feature.json +++ b/configuration/aus_prospectus/special_datapoint_feature.json @@ -1,9 +1,42 @@ { "management_fee_including_performance_fee": { - "regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"], - "effective_datapoints": ["management_fee_and_costs"], - "exclude_datapoints": ["performance_fee_costs"], - "provider_ids": ["0C00005549"], - "provider_names": ["Vision Super Pty Ltd"] + "details": [ + {"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"], + "effective_datapoints": ["management_fee_and_costs"], + "exclude_datapoints": ["performance_fee_costs"]}, + {"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"], + "effective_datapoints": ["performance_fee_costs"], + "exclude_datapoints": ["management_fee_and_costs"]} + ], + "provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK", + "0C000069VJ", "0C0000AL58", "0C00006B9E", + "0C00006BDB", "0C00006BDD", "0C00006BDG", + "0C000035YC", "0C0000CSKN", "0C00005549", + "0C000051C6", "0C00008JA0", "0C000093Z4", + "0C0000B5L6", "0C00006EGK", "0C00006EJI", + "0C00006FYL", "0C00006G0Q", "0C00006GIF", + "0C00006GNW", "0C00006GPU", "0C00006H46", + "0C00006H4J", "0C00006H4Q", "0C0000A5XQ", + "0C0000BBPL", "0C0000C2MS", "0C0000CVRL", + "0C0000AV6P", "0C00001XXQ", "0C00001XYR", + "0C00006AZB", "0C00006BN6", "0C00006BXE", + "0C00006CIK", "0C00006CJ2", "0C00006DOA", + "0C0000CAQF", "0C0000CAQH", "0C0000CAQO", + "0C0000CAQR"], + "provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd", + "Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd", + "RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd", + "JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd", + "Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd", + "GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd", + "Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd", + "Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd", + "Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd", + "Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd", + "Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd", + "AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd", + "Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd", + "KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd", + "PostSuper Pty Ltd", "Legal Super Pty Ltd"] } } \ No newline at end of file diff --git a/core/data_extraction.py b/core/data_extraction.py index fab74ab..258f0a9 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -99,7 +99,7 @@ class DataExtraction: len(list(self.special_datapoint_feature_config.keys())) == 0: return special_datapoint_feature for feature in list(self.special_datapoint_feature_config.keys()): - special_datapoint_feature[feature] = {"page_index": []} + special_datapoint_feature[feature] = {} return special_datapoint_feature def get_document_category_production(self): @@ -153,7 +153,6 @@ class DataExtraction: pass return fund_name - def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict: """ If document source is aus_propectus and document category is MIS @@ -558,6 +557,8 @@ class DataExtraction: """ If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value. """ + if len(self.fund_name_list) < 3: + return data_list, [] raw_name_dict = self.get_raw_name_dict(data_list) raw_name_list = list(raw_name_dict.keys()) if len(raw_name_list) < 3: @@ -729,10 +730,34 @@ class DataExtraction: def post_management_fee_exclude_performance_fee(self, data_list: list): adjust = False + mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\ get("page_index", []) if len(mangement_fee_index_list) == 0: return data_list, adjust + effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\ + get("datapoint", "") + if effective_datapoint == "performance_fee_costs": + mangement_fee_index_list = [] + exist_effective_datapoints = False + exist_exclude_datapoints = False + for data_dict in data_list: + page_index = data_dict.get("page_index", -1) + data = data_dict.get("extract_data", {}).get("data", []) + for data_item in data: + datapoints = [datapoint for datapoint in list(data_item.keys()) + if datapoint == "management_fee_and_costs"] + if len(datapoints) > 0: + exist_effective_datapoints = True + datapoints = [datapoint for datapoint in list(data_item.keys()) + if datapoint == "performance_fee_costs"] + if len(datapoints) > 0: + exist_exclude_datapoints = True + if exist_effective_datapoints and exist_exclude_datapoints: + break + if exist_effective_datapoints and not exist_exclude_datapoints: + if page_index not in mangement_fee_index_list: + mangement_fee_index_list.append(page_index) min_page_index = min(mangement_fee_index_list) performance_fee_item_list = [] for data_dict in data_list: @@ -759,7 +784,7 @@ class DataExtraction: keys = list(management_fee_data.keys()) fund_name = management_fee_data.get("fund_name", "") share_name = management_fee_data.get("share_name", "") - if len(fund_name) == 0 or len(share_name) == 0: + if fund_name == "" or share_name == "": continue if "management_fee_and_costs" in keys: management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1) @@ -771,7 +796,9 @@ class DataExtraction: for performance_fee_item in performance_fee_item_list: pf_fund_name = performance_fee_item.get("fund_name", "") pf_share_name = performance_fee_item.get("share_name", "") - if pf_fund_name == fund_name and pf_share_name == share_name: + if pf_fund_name == "" or pf_share_name == "": + continue + if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower(): performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1) try: performance_fee_costs = float(performance_fee_costs) @@ -943,7 +970,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [37, 38]: + # if page_num not in [42]: # continue if page_num in handled_page_num_list: continue @@ -1692,16 +1719,15 @@ class DataExtraction: new_data_list.append(new_data) extract_data_info["data"] = new_data_list if page_text is not None and len(page_text) > 0: - self.set_datapoint_feature_properties(new_data_list, page_text, page_num) + try: + self.set_datapoint_feature_properties(new_data_list, page_text, page_num) + except Exception as e: + logger.error(f"Error in setting datapoint feature properties: {e}") return extract_data_info def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None: for feature, properties in self.special_datapoint_feature_config.items(): - regex_text_list = properties.get("regex_text", []) - if len(regex_text_list) == 0: - continue - effective_datapoints = properties.get("effective_datapoints", []) - if len(effective_datapoints) == 0: + if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None: continue provider_ids = properties.get("provider_ids", []) if len(provider_ids) > 0: @@ -1714,36 +1740,51 @@ class DataExtraction: break if not is_current_provider: continue - exclude_datapoints = properties.get("exclude_datapoints", []) - - exist_effective_datapoints = False - exist_exclude_datapoints = False - for data_item in data_list: - datapoints = [datapoint for datapoint in list(data_item.keys()) - if datapoint in effective_datapoints] - if len(datapoints) > 0: - exist_effective_datapoints = True - datapoints = [datapoint for datapoint in list(data_item.keys()) - if datapoint in exclude_datapoints] - if len(datapoints) > 0: - exist_exclude_datapoints = True - if exist_effective_datapoints and exist_exclude_datapoints: - break + detail_list = properties.get("details", []) + if len(detail_list) == 0: + continue + set_feature_property = False + for detail in detail_list: + regex_text_list = detail.get("regex_text", []) + if len(regex_text_list) == 0: + continue + effective_datapoints = detail.get("effective_datapoints", []) + if len(effective_datapoints) == 0: + continue + exclude_datapoints = detail.get("exclude_datapoints", []) - if not exist_effective_datapoints: - continue - if exist_exclude_datapoints: - continue - found_regex_text = False - for regex_text in regex_text_list: - regex_search = re.search(regex_text, page_text, re.IGNORECASE) - if regex_search is not None: - found_regex_text = True + exist_effective_datapoints = False + exist_exclude_datapoints = False + for data_item in data_list: + datapoints = [datapoint for datapoint in list(data_item.keys()) + if datapoint in effective_datapoints] + if len(datapoints) > 0: + exist_effective_datapoints = True + datapoints = [datapoint for datapoint in list(data_item.keys()) + if datapoint in exclude_datapoints] + if len(datapoints) > 0: + exist_exclude_datapoints = True + if exist_effective_datapoints and exist_exclude_datapoints: + break + + if not exist_effective_datapoints: + continue + if exist_exclude_datapoints: + continue + found_regex_text = False + for regex_text in regex_text_list: + regex_search = re.search(regex_text, page_text, re.IGNORECASE) + if regex_search is not None: + found_regex_text = True + break + if found_regex_text: + if self.special_datapoint_feature[feature].get("page_index", None) is None: + self.special_datapoint_feature[feature]["page_index"] = [] + self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0] + self.special_datapoint_feature[feature]["page_index"].append(page_num) + set_feature_property = True + if set_feature_property: break - if found_regex_text: - if self.special_datapoint_feature[feature].get("page_index", None) is None: - self.special_datapoint_feature[feature]["page_index"] = [] - self.special_datapoint_feature[feature]["page_index"].append(page_num) def split_multi_share_name(self, raw_share_name: str) -> list: """ diff --git a/core/page_filter.py b/core/page_filter.py index cd82137..ecffccc 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -148,9 +148,9 @@ class FilterPages: } effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0] document_type = self.document_mapping_info_df["DocumentType"].iloc[0] - if document_type in [4, 5]: + if document_type in [4, 5] or self.doc_source == "emea_ar": document_type = "ar" - elif document_type == 1: + elif document_type == 1 or self.doc_source == "aus_prospectus": document_type = "prospectus" language_id = self.document_mapping_info_df["Language"].iloc[0] language = self.language_config.get(language_id, None) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 37489a6..3d85007 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -71,6 +71,7 @@ "If with multiple data values in same row, please extract the latest.", "\n", "4. Reported names:", + "**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**", "Only output the values which with significant reported names.", "Multiple data columns with same reported name but different post-fix:", "If there are multiple reported names with different post-fix text, here is the priority rule:", @@ -122,7 +123,7 @@ "total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00", "management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.", "management_fee": "Management fee is belong to percentage number, the value should be less than 100.", - "performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.", + "performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.", "buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.", "sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.", "establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.", diff --git a/main.py b/main.py index 0a61596..6396ae0 100644 --- a/main.py +++ b/main.py @@ -1531,17 +1531,18 @@ if __name__ == "__main__": # document_sample_file = ( # r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # ) - document_sample_file = ( - r"./sample_documents/aus_prospectus_46_documents_sample.txt" - ) # document_sample_file = ( - # r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt" + # r"./sample_documents/aus_prospectus_46_documents_sample.txt" # ) + document_sample_file = ( + r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt" + ) logger.info(f"Start to run document sample file: {document_sample_file}") with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() if len(doc_id.strip()) > 0] - # special_doc_id_list = ["527969661"] + # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"] + # special_doc_id_list = ["539999907", "455235248", "448576924"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (