optimized for phase 2 data

2025-02-18 18:52:26 -06:00 · 2025-02-18 18:52:26 -06:00 · 705933bbdd
parent 353bc28599
commit 705933bbdd
5 changed files with 108 additions and 21 deletions
--- a/configuration/aus_prospectus/datapoint_keyword.json
+++ b/configuration/aus_prospectus/datapoint_keyword.json
@ -1,7 +1,7 @@
 {
  "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
-  "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
-  "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
+  "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]},
+  "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
  "performance_fee": {"english": ["performance fee", "performance fees"]},
  "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
  "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
--- a/configuration/aus_prospectus/datapoint_reported_name.json
+++ b/configuration/aus_prospectus/datapoint_reported_name.json
@ -1,7 +1,7 @@
 {
    "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
-    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs"]},
-    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
+    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs"]},
+    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
    "performance_fee": {"english": ["performance fee", "performance fees"]},
    "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
    "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -205,6 +205,76 @@ class DataExtraction:
        data_dict["completion_token"] = result.get("completion_token", 0)
        data_dict["total_token"] = result.get("total_token", 0)
        """
+        data_list = self.post_adjust_management_fee_costs(data_list)
+        data_list = self.supplement_minimum_initial_investment(data_list)
+        
+        return data_list
+    
+    def post_adjust_management_fee_costs(self, data_list: list) -> list:
+        """
+        Reconcile management fee values across all extracted items that
+        describe the same (fund_name, share_name) pair.
+
+        For every pair, "management_fee" is set to the smallest value found
+        and "management_fee_and_costs" to the largest value found. A value of
+        -1 is treated as missing and never takes part in the comparison, but
+        an item that carries the key with -1 is still overwritten when another
+        item supplies a real value for the same pair. Items whose fund_name or
+        share_name is an empty string are left untouched.
+
+        Args:
+            data_list: list of dicts, each holding its items under
+                data_dict["extract_data"]["data"]. Items are updated in place.
+
+        Returns:
+            The same data_list object, after the in-place adjustment.
+        """
+        def iter_named_items():
+            # Yield ((fund_name, share_name), item) for every item that names
+            # both a fund and a share; a missing/None payload is skipped
+            # instead of raising.
+            for data_dict in data_list:
+                extract_data = data_dict.get("extract_data") or {}
+                for data_item in extract_data.get("data") or []:
+                    fund_name = data_item.get("fund_name", "")
+                    share_name = data_item.get("share_name", "")
+                    if fund_name == "" or share_name == "":
+                        continue
+                    yield (fund_name, share_name), data_item
+
+        # One entry per (fund_name, share_name): a dict lookup replaces the
+        # former list scan, which appended duplicate entries whenever the
+        # stored value already won the comparison.
+        lowest_management_fee = {}
+        highest_management_fee_costs = {}
+        for key, data_item in iter_named_items():
+            management_fee = data_item.get("management_fee", -1)
+            if management_fee != -1:
+                if (key not in lowest_management_fee
+                        or lowest_management_fee[key] >= management_fee):
+                    lowest_management_fee[key] = management_fee
+            management_fee_costs = data_item.get("management_fee_and_costs", -1)
+            if management_fee_costs != -1:
+                if (key not in highest_management_fee_costs
+                        or highest_management_fee_costs[key] <= management_fee_costs):
+                    highest_management_fee_costs[key] = management_fee_costs
+
+        # Second pass: write the reconciled values back, but only onto items
+        # that already carry the corresponding key.
+        for key, data_item in iter_named_items():
+            if "management_fee" in data_item and key in lowest_management_fee:
+                data_item["management_fee"] = lowest_management_fee[key]
+            if ("management_fee_and_costs" in data_item
+                    and key in highest_management_fee_costs):
+                data_item["management_fee_and_costs"] = highest_management_fee_costs[key]
+        return data_list
+        
+    
+    def supplement_minimum_initial_investment(self, data_list: list):
        exist_minimum_initial_investment = False
        minimum_initial_investment = -1
        mii_fund_name = ""
@ -241,8 +311,6 @@ class DataExtraction:
                new_mii_data_list.append(new_data_dict)
            mii_dict["extract_data"]["data"] = new_mii_data_list
        return data_list
-        
-        

    def extract_data_by_text(self) -> dict:
        """
@ -318,12 +386,18 @@ class DataExtraction:
                                should_continue = True
                            else:
                                for next_datapoint in next_datapoints:
-                                    if next_datapoint not in page_datapoints:
-                                        should_continue = True
-                                        break
-                                next_datapoints.extend(page_datapoints)
-                                # remove duplicate datapoints
-                                next_datapoints = list(set(next_datapoints))
+                                    if self.doc_source == "aus_prospectus":
+                                        if next_datapoint in page_datapoints:
+                                            should_continue = False
+                                            break
+                                    else:
+                                        if next_datapoint not in page_datapoints:
+                                            should_continue = True
+                                            break
+                                if should_continue:
+                                    next_datapoints.extend(page_datapoints)
+                                    # remove duplicate datapoints
+                                    next_datapoints = list(set(next_datapoints))
                            if not should_continue:
                                break
                        if extract_way == "text":
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -146,7 +146,19 @@
 				"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
 				"---Example 2 End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]"
+				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]",
+				"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
+				"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
+				"---Example 1 Start---",
+				"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n",
+				"---Example 1 End---",
+				"For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:",
+				"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]",
+				"---Example 2 Start---",
+				"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
+				"---Example 2 End---",
+				"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
+				"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]"
 			],
 			"buy_spread": [
 				"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
--- a/main.py
+++ b/main.py
@ -1042,8 +1042,8 @@ def batch_run_documents(
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
-    re_run_extract_data = True
-    re_run_mapping_data = True
+    re_run_extract_data = False
+    re_run_mapping_data = False
    force_save_total_data = True
    calculate_metrics = False

@ -1397,16 +1397,17 @@ if __name__ == "__main__":
        # document_sample_file = (
        #     r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
        # )
-        # document_sample_file = (
-        #     r"./sample_documents/aus_prospectus_17_documents_sample.txt"
-        # )
        document_sample_file = (
-            r"./sample_documents/aus_prospectus_52_documents_sample.txt"
+            r"./sample_documents/aus_prospectus_17_documents_sample.txt"
        )
+        # document_sample_file = (
+        #     r"./sample_documents/aus_prospectus_52_documents_sample.txt"
+        # )
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
        # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
-        document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
+        # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
+        document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
        # special_doc_id_list: list = [
        #     "539790009",
        #     "542300403",
@ -1420,7 +1421,7 @@ if __name__ == "__main__":
        #     "555377021",
        #     "555654388",
        # ]
-        # special_doc_id_list: list = ["377377369"]
+        # special_doc_id_list: list = ["401212184"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (