1. Handle the scenario where the document type is not 1, 4, or 5.

2. Support the scenario where the "investment fees and costs including performance fees" statement appears on the performance fee data page instead of the management fee and costs data page.
This commit is contained in:
Blade He 2025-04-03 17:06:43 -05:00
parent 4b896f4460
commit f333cc30f5
5 changed files with 128 additions and 52 deletions

View File

@ -1,9 +1,42 @@
{
"management_fee_including_performance_fee": {
"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"],
"provider_ids": ["0C00005549"],
"provider_names": ["Vision Super Pty Ltd"]
"details": [
{"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"]},
{"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
"effective_datapoints": ["performance_fee_costs"],
"exclude_datapoints": ["management_fee_and_costs"]}
],
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
"0C000069VJ", "0C0000AL58", "0C00006B9E",
"0C00006BDB", "0C00006BDD", "0C00006BDG",
"0C000035YC", "0C0000CSKN", "0C00005549",
"0C000051C6", "0C00008JA0", "0C000093Z4",
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
"0C00006GNW", "0C00006GPU", "0C00006H46",
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
"0C00006AZB", "0C00006BN6", "0C00006BXE",
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
"0C0000CAQR"],
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
}
}
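For reference, here is a minimal sketch of how one entry in the new "details" list is intended to be evaluated against a page, mirroring the logic of set_datapoint_feature_properties further down; the helper name match_feature_detail is hypothetical and not part of the repository.

import re

def match_feature_detail(detail: dict, page_text: str, data_list: list) -> str:
    # A detail entry applies only if at least one effective datapoint was extracted
    # on the page and none of the exclude datapoints were extracted.
    effective_datapoints = detail.get("effective_datapoints", [])
    exclude_datapoints = detail.get("exclude_datapoints", [])
    has_effective = any(dp in item for item in data_list for dp in effective_datapoints)
    has_exclude = any(dp in item for item in data_list for dp in exclude_datapoints)
    if not has_effective or has_exclude:
        return ""
    # The page text must also contain the configured statement.
    for regex_text in detail.get("regex_text", []):
        if re.search(regex_text, page_text, re.IGNORECASE):
            return effective_datapoints[0]  # recorded as the feature's "datapoint"
    return ""

In the actual pipeline, the first matching detail records the page index and the effective datapoint on special_datapoint_feature, and post_management_fee_exclude_performance_fee later uses that "datapoint" value to decide which fee to adjust.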

View File

@ -99,7 +99,7 @@ class DataExtraction:
len(list(self.special_datapoint_feature_config.keys())) == 0:
return special_datapoint_feature
for feature in list(self.special_datapoint_feature_config.keys()):
special_datapoint_feature[feature] = {"page_index": []}
special_datapoint_feature[feature] = {}
return special_datapoint_feature
def get_document_category_production(self):
@ -153,7 +153,6 @@ class DataExtraction:
pass
return fund_name
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
"""
If document source is aus_prospectus and document category is MIS
@ -558,6 +557,8 @@ class DataExtraction:
"""
If a datapoint carries the production name, then each fund/share class in the same document should have the same value for that datapoint.
"""
if len(self.fund_name_list) < 3:
return data_list, []
raw_name_dict = self.get_raw_name_dict(data_list)
raw_name_list = list(raw_name_dict.keys())
if len(raw_name_list) < 3:
@ -729,10 +730,34 @@ class DataExtraction:
def post_management_fee_exclude_performance_fee(self, data_list: list):
adjust = False
mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("page_index", [])
if len(mangement_fee_index_list) == 0:
return data_list, adjust
effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
get("datapoint", "")
if effective_datapoint == "performance_fee_costs":
mangement_fee_index_list = []
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_dict in data_list:
page_index = data_dict.get("page_index", -1)
data = data_dict.get("extract_data", {}).get("data", [])
for data_item in data:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "management_fee_and_costs"]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint == "performance_fee_costs"]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if exist_effective_datapoints and not exist_exclude_datapoints:
if page_index not in mangement_fee_index_list:
mangement_fee_index_list.append(page_index)
min_page_index = min(mangement_fee_index_list)
performance_fee_item_list = []
for data_dict in data_list:
@ -759,7 +784,7 @@ class DataExtraction:
keys = list(management_fee_data.keys())
fund_name = management_fee_data.get("fund_name", "")
share_name = management_fee_data.get("share_name", "")
if len(fund_name) == 0 or len(share_name) == 0:
if fund_name == "" or share_name == "":
continue
if "management_fee_and_costs" in keys:
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
@ -771,7 +796,9 @@ class DataExtraction:
for performance_fee_item in performance_fee_item_list:
pf_fund_name = performance_fee_item.get("fund_name", "")
pf_share_name = performance_fee_item.get("share_name", "")
if pf_fund_name == fund_name and pf_share_name == share_name:
if pf_fund_name == "" or pf_share_name == "":
continue
if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
try:
performance_fee_costs = float(performance_fee_costs)
@ -943,7 +970,7 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
# if page_num not in [37, 38]:
# if page_num not in [42]:
# continue
if page_num in handled_page_num_list:
continue
@ -1692,16 +1719,15 @@ class DataExtraction:
new_data_list.append(new_data)
extract_data_info["data"] = new_data_list
if page_text is not None and len(page_text) > 0:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
try:
self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
except Exception as e:
logger.error(f"Error in setting datapoint feature properties: {e}")
return extract_data_info
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
for feature, properties in self.special_datapoint_feature_config.items():
regex_text_list = properties.get("regex_text", [])
if len(regex_text_list) == 0:
continue
effective_datapoints = properties.get("effective_datapoints", [])
if len(effective_datapoints) == 0:
if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
continue
provider_ids = properties.get("provider_ids", [])
if len(provider_ids) > 0:
@ -1714,36 +1740,51 @@ class DataExtraction:
break
if not is_current_provider:
continue
exclude_datapoints = properties.get("exclude_datapoints", [])
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_item in data_list:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in effective_datapoints]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in exclude_datapoints]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
detail_list = properties.get("details", [])
if len(detail_list) == 0:
continue
set_feature_property = False
for detail in detail_list:
regex_text_list = detail.get("regex_text", [])
if len(regex_text_list) == 0:
continue
effective_datapoints = detail.get("effective_datapoints", [])
if len(effective_datapoints) == 0:
continue
exclude_datapoints = detail.get("exclude_datapoints", [])
if not exist_effective_datapoints:
continue
if exist_exclude_datapoints:
continue
found_regex_text = False
for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None:
found_regex_text = True
exist_effective_datapoints = False
exist_exclude_datapoints = False
for data_item in data_list:
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in effective_datapoints]
if len(datapoints) > 0:
exist_effective_datapoints = True
datapoints = [datapoint for datapoint in list(data_item.keys())
if datapoint in exclude_datapoints]
if len(datapoints) > 0:
exist_exclude_datapoints = True
if exist_effective_datapoints and exist_exclude_datapoints:
break
if not exist_effective_datapoints:
continue
if exist_exclude_datapoints:
continue
found_regex_text = False
for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None:
found_regex_text = True
break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
self.special_datapoint_feature[feature]["page_index"].append(page_num)
set_feature_property = True
if set_feature_property:
break
if found_regex_text:
if self.special_datapoint_feature[feature].get("page_index", None) is None:
self.special_datapoint_feature[feature]["page_index"] = []
self.special_datapoint_feature[feature]["page_index"].append(page_num)
def split_multi_share_name(self, raw_share_name: str) -> list:
"""

View File

@ -148,9 +148,9 @@ class FilterPages:
}
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
if document_type in [4, 5]:
if document_type in [4, 5] or self.doc_source == "emea_ar":
document_type = "ar"
elif document_type == 1:
elif document_type == 1 or self.doc_source == "aus_prospectus":
document_type = "prospectus"
language_id = self.document_mapping_info_df["Language"].iloc[0]
language = self.language_config.get(language_id, None)
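This pairs with commit item 1: when the numeric document type is neither 1 nor 4/5, the document source now decides the category. A simplified sketch of the resulting mapping (the pass-through branch for unmapped types is an assumption, not shown in the diff):

def resolve_document_type(document_type, doc_source: str):
    if document_type in [4, 5] or doc_source == "emea_ar":
        return "ar"
    if document_type == 1 or doc_source == "aus_prospectus":
        return "prospectus"
    return document_type  # assumption: types outside 1/4/5 pass through unchanged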

View File

@ -71,6 +71,7 @@
"If with multiple data values in same row, please extract the latest.",
"\n",
"4. Reported names:",
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
"Only output the values which with significant reported names.",
"Multiple data columns with same reported name but different post-fix:",
"If there are multiple reported names with different post-fix text, here is the priority rule:",
@ -122,7 +123,7 @@
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",

main.py
View File

@ -1531,17 +1531,18 @@ if __name__ == "__main__":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
)
# document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
# r"./sample_documents/aus_prospectus_46_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
)
logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0]
# special_doc_id_list = ["527969661"]
# special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
# special_doc_id_list = ["539999907", "455235248", "448576924"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (