1. Handle the scenario where the document type is not 1, 4, or 5.

2. Support the scenario where the "investment fees and costs including performance" statement appears on the performance fee data page instead of on the management fee and costs data page.
2025-04-03 17:06:43 -05:00 · 2025-04-03 17:06:43 -05:00 · f333cc30f5
parent 4b896f4460
commit f333cc30f5
5 changed files with 128 additions and 52 deletions
--- a/configuration/aus_prospectus/special_datapoint_feature.json
+++ b/configuration/aus_prospectus/special_datapoint_feature.json
@ -1,9 +1,42 @@
 {
    "management_fee_including_performance_fee": {
-        "regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
+        "details": [
            {"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
            "effective_datapoints": ["management_fee_and_costs"],
-        "exclude_datapoints": ["performance_fee_costs"],
+            "exclude_datapoints": ["performance_fee_costs"]},
-        "provider_ids": ["0C00005549"],
+            {"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
-        "provider_names": ["Vision Super Pty Ltd"]
+            "effective_datapoints": ["performance_fee_costs"],
            "exclude_datapoints": ["management_fee_and_costs"]}
        ],
        "provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK", 
        "0C000069VJ", "0C0000AL58", "0C00006B9E", 
        "0C00006BDB", "0C00006BDD", "0C00006BDG", 
        "0C000035YC", "0C0000CSKN", "0C00005549",
         "0C000051C6", "0C00008JA0", "0C000093Z4", 
         "0C0000B5L6", "0C00006EGK", "0C00006EJI", 
         "0C00006FYL", "0C00006G0Q", "0C00006GIF", 
         "0C00006GNW", "0C00006GPU", "0C00006H46", 
         "0C00006H4J", "0C00006H4Q", "0C0000A5XQ", 
         "0C0000BBPL", "0C0000C2MS", "0C0000CVRL", 
         "0C0000AV6P", "0C00001XXQ", "0C00001XYR", 
         "0C00006AZB", "0C00006BN6", "0C00006BXE", 
         "0C00006CIK", "0C00006CJ2", "0C00006DOA", 
         "0C0000CAQF", "0C0000CAQH", "0C0000CAQO", 
         "0C0000CAQR"],
        "provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd", 
        "Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd", 
        "RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd", 
        "JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd", 
        "Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd", 
        "GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd", 
        "Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd", 
        "Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd", 
        "Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd", 
        "Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd", 
        "Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd", 
        "AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd", 
        "Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd", 
        "KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd", 
        "PostSuper Pty Ltd", "Legal Super Pty Ltd"]
    }
 }
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -99,7 +99,7 @@ class DataExtraction:
            len(list(self.special_datapoint_feature_config.keys())) == 0:
            return special_datapoint_feature
        for feature in list(self.special_datapoint_feature_config.keys()):
-            special_datapoint_feature[feature] = {"page_index": []}
+            special_datapoint_feature[feature] = {}
        return special_datapoint_feature
    def get_document_category_production(self):
@ -153,7 +153,6 @@ class DataExtraction:
                        pass
        return fund_name
    def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
        """
        If document source is aus_propectus and document category is MIS
@ -558,6 +557,8 @@ class DataExtraction:
        """
        If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
        """
        if len(self.fund_name_list) < 3:
            return data_list, []
        raw_name_dict = self.get_raw_name_dict(data_list)
        raw_name_list = list(raw_name_dict.keys())
        if len(raw_name_list) < 3:
@ -729,10 +730,34 @@ class DataExtraction:
    def post_management_fee_exclude_performance_fee(self, data_list: list):
        adjust = False
        mangement_fee_index_list = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
            get("page_index", [])
        if len(mangement_fee_index_list) == 0:
            return data_list, adjust
        effective_datapoint = self.special_datapoint_feature.get("management_fee_including_performance_fee", {}).\
            get("datapoint", "")
        if effective_datapoint == "performance_fee_costs":
            mangement_fee_index_list = []
            exist_effective_datapoints = False
            exist_exclude_datapoints = False
            for data_dict in data_list:
                page_index = data_dict.get("page_index", -1)
                data = data_dict.get("extract_data", {}).get("data", [])
                for data_item in data:
                    datapoints = [datapoint for datapoint in list(data_item.keys())
                                if datapoint == "management_fee_and_costs"]
                    if len(datapoints) > 0:
                        exist_effective_datapoints = True
                    datapoints = [datapoint for datapoint in list(data_item.keys())
                                if datapoint == "performance_fee_costs"]
                    if len(datapoints) > 0:
                        exist_exclude_datapoints = True
                    if exist_effective_datapoints and exist_exclude_datapoints:
                        break
                if exist_effective_datapoints and not exist_exclude_datapoints:
                    if page_index not in mangement_fee_index_list:
                        mangement_fee_index_list.append(page_index)
        min_page_index = min(mangement_fee_index_list)
        performance_fee_item_list = []
        for data_dict in data_list:
@ -759,7 +784,7 @@ class DataExtraction:
                keys = list(management_fee_data.keys())
                fund_name = management_fee_data.get("fund_name", "")
                share_name = management_fee_data.get("share_name", "")
-                if len(fund_name) == 0 or len(share_name) == 0:
+                if fund_name == "" or share_name == "":
                    continue
                if "management_fee_and_costs" in keys:
                    management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
@ -771,7 +796,9 @@ class DataExtraction:
                        for performance_fee_item in performance_fee_item_list:
                            pf_fund_name = performance_fee_item.get("fund_name", "")
                            pf_share_name = performance_fee_item.get("share_name", "")
-                            if pf_fund_name == fund_name and pf_share_name == share_name:
+                            if pf_fund_name == "" or pf_share_name == "":
                                continue
                            if pf_fund_name.lower() == fund_name.lower() and pf_share_name.lower() == share_name.lower():
                                performance_fee_costs = performance_fee_item.get("performance_fee_costs", -1)
                                try:
                                    performance_fee_costs = float(performance_fee_costs)
@ -943,7 +970,7 @@ class DataExtraction:
        previous_page_datapoints = []
        previous_page_fund_name = None
        for page_num, page_text in self.page_text_dict.items():
-            # if page_num not in [37, 38]:
+            # if page_num not in [42]:
            #     continue
            if page_num in handled_page_num_list:
                continue
@ -1692,16 +1719,15 @@ class DataExtraction:
                    new_data_list.append(new_data)
        extract_data_info["data"] = new_data_list
        if page_text is not None and len(page_text) > 0:
            try:
                self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
            except Exception as e:
                logger.error(f"Error in setting datapoint feature properties: {e}")
        return extract_data_info
    def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
        for feature, properties in self.special_datapoint_feature_config.items():
-            regex_text_list = properties.get("regex_text", [])
+            if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
            if len(regex_text_list) == 0:
                continue
            effective_datapoints = properties.get("effective_datapoints", [])
            if len(effective_datapoints) == 0:
                continue
            provider_ids = properties.get("provider_ids", [])
            if len(provider_ids) > 0:
@ -1714,7 +1740,18 @@ class DataExtraction:
                            break
                if not is_current_provider:
                    continue
-            exclude_datapoints = properties.get("exclude_datapoints", [])
+            detail_list = properties.get("details", [])
            if len(detail_list) == 0:
                continue
            set_feature_property = False
            for detail in detail_list:
                regex_text_list = detail.get("regex_text", [])
                if len(regex_text_list) == 0:
                    continue
                effective_datapoints = detail.get("effective_datapoints", [])
                if len(effective_datapoints) == 0:
                    continue
                exclude_datapoints = detail.get("exclude_datapoints", [])
                exist_effective_datapoints = False
                exist_exclude_datapoints = False
@ -1743,7 +1780,11 @@ class DataExtraction:
                if found_regex_text:
                    if self.special_datapoint_feature[feature].get("page_index", None) is None:
                        self.special_datapoint_feature[feature]["page_index"] = []
                        self.special_datapoint_feature[feature]["datapoint"] = effective_datapoints[0]
                    self.special_datapoint_feature[feature]["page_index"].append(page_num)
                    set_feature_property = True
                if set_feature_property:
                    break
    def split_multi_share_name(self, raw_share_name: str) -> list:
        """
--- a/core/page_filter.py
+++ b/core/page_filter.py
@ -148,9 +148,9 @@ class FilterPages:
            }
        effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
        document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
-        if document_type in [4, 5]:
+        if document_type in [4, 5] or self.doc_source == "emea_ar":
            document_type = "ar"
-        elif document_type == 1:
+        elif document_type == 1 or self.doc_source == "aus_prospectus":
            document_type = "prospectus"
        language_id = self.document_mapping_info_df["Language"].iloc[0]
        language = self.language_config.get(language_id, None)
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -71,6 +71,7 @@
 			"If with multiple data values in same row, please extract the latest.",
 			"\n",
 			"4. Reported names:",
 			"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
 			"Only output the values which with significant reported names.",
 			"Multiple data columns with same reported name but different post-fix:",
 			"If there are multiple reported names with different post-fix text, here is the priority rule:",
@ -122,7 +123,7 @@
 			"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
 			"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
 			"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
-			"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
+			"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
 			"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
 			"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
 			"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
--- a/main.py
+++ b/main.py
@ -1531,17 +1531,18 @@ if __name__ == "__main__":
        # document_sample_file = (
        #     r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
        # )
        document_sample_file = (
            r"./sample_documents/aus_prospectus_46_documents_sample.txt"
        )
        # document_sample_file = (
-        #     r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
+        #     r"./sample_documents/aus_prospectus_46_documents_sample.txt"
        # )
        document_sample_file = (
            r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
        )
        logger.info(f"Start to run document sample file: {document_sample_file}")
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
                                    if len(doc_id.strip()) > 0]
-        # special_doc_id_list = ["527969661"]
+        # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
        # special_doc_id_list = ["539999907", "455235248", "448576924"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (