1. Get the production name from the document.

2. If some data point is extracted under the production name, set each fund/share class with the relevant data point value(s).
This commit is contained in:
Blade He 2025-02-27 12:07:49 -06:00
parent 412692e1c4
commit 543cab74e1
4 changed files with 158 additions and 18 deletions

View File

@ -74,12 +74,13 @@ class DataExtraction:
self.datapoint_name_config = self.get_datapoint_name() self.datapoint_name_config = self.get_datapoint_name()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name() self.get_datapoint_reported_name()
self.document_category = self.get_document_category() self.document_category, self.document_production = self.get_document_category_production()
self.extract_way = extract_way self.extract_way = extract_way
self.output_image_folder = output_image_folder self.output_image_folder = output_image_folder
def get_document_category(self): def get_document_category_production(self):
document_category = None document_category = None
document_production = None
if self.doc_source == "aus_prospectus": if self.doc_source == "aus_prospectus":
first_4_page_text = "" first_4_page_text = ""
for page_index, page_text in self.page_text_dict.items(): for page_index, page_text in self.page_text_dict.items():
@ -100,10 +101,11 @@ class DataExtraction:
try: try:
data = json.loads(response) data = json.loads(response)
document_category = data.get("document_category", None) document_category = data.get("document_category", None)
document_production = data.get("document_production", None)
except: except:
pass pass
return document_category return document_category, document_production
def get_investment_objective_pages(self): def get_investment_objective_pages(self):
investment_objective_pages = [] investment_objective_pages = []
@ -237,12 +239,89 @@ class DataExtraction:
data_dict["completion_token"] = result.get("completion_token", 0) data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0) data_dict["total_token"] = result.get("total_token", 0)
""" """
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.remove_duplicate_data(data_list) data_list = self.remove_duplicate_data(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
data_list = self.post_adjust_management_fee_costs(data_list) data_list = self.post_adjust_management_fee_costs(data_list)
if "minimum_initial_investment" not in datapoint_list_with_production_name:
data_list = self.supplement_minimum_initial_investment(data_list) data_list = self.supplement_minimum_initial_investment(data_list)
return data_list return data_list
def post_adjust_for_value_with_production_name(self, data_list: list):
    """
    Propagate datapoint values that were extracted under the document's
    production (document-level) name to every fund/share class.

    If a datapoint row was extracted with the production name instead of a
    concrete fund/share class name, each fund/share class in the same
    document should carry the same value for that datapoint.  The
    production-name rows are removed and replaced by one row per known
    fund/share class; per-fund values for those datapoints found elsewhere
    are dropped (the production-name value wins).

    :param data_list: page-level extraction dicts, each holding
        ``extract_data["data"]`` rows keyed by ``fund_name``/``share_name``.
    :return: tuple ``(data_list, datapoint_list_with_production_name)`` —
        the second element lists the propagated datapoint keys (empty when
        no production-name row was found).
    """
    # document_production may be None (see get_document_category_production);
    # guard before calling .lower().  Keep the caller's two-value unpacking
    # contract on every exit path.
    if not isinstance(self.document_production, str) or not self.document_production:
        return data_list, []
    raw_name_dict = self.get_raw_name_dict(data_list)
    raw_name_as_production_name = None
    for raw_name in raw_name_dict:
        if raw_name.lower() in self.document_production.lower():
            raw_name_as_production_name = raw_name
            break
    if raw_name_as_production_name is None:
        # BUG FIX: the original returned a bare list here, which broke the
        # caller's ``data_list, dp_list = ...`` unpacking.
        return data_list, []
    # The production-name entry itself must not receive propagated rows.
    raw_name_dict.pop(raw_name_as_production_name)
    datapoint_list_with_production_name = []
    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        remove_item_list = []
        new_dp_item_list = []
        for data_item in data:
            fund_name = data_item.get("fund_name", "")
            share_name = data_item.get("share_name", "")
            raw_name = self.get_raw_name(fund_name, share_name)
            if raw_name.lower() in self.document_production.lower():
                dp_keys = [key for key in data_item if key not in ["fund_name", "share_name"]]
                for dp_key in dp_keys:
                    if dp_key not in datapoint_list_with_production_name:
                        datapoint_list_with_production_name.append(dp_key)
                remove_item_list.append(data_item)
                # Fan the production-name values out to every other
                # fund/share class seen in the document.
                for v_raw_name, v_dict in raw_name_dict.items():
                    v_fund_name = v_dict.get("fund_name", "")
                    v_share_name = v_dict.get("share_name", "")
                    # BUG FIX: decide on the TARGET row's share name
                    # (v_share_name), not the production row's share_name.
                    if len(v_share_name) > 0:
                        new_dp_item = {"fund_name": v_fund_name, "share_name": v_share_name}
                    else:
                        new_dp_item = {"fund_name": v_fund_name}
                    for dp_key in dp_keys:
                        new_dp_item[dp_key] = data_item.get(dp_key, "")
                    # Provenance marker; the cleanup pass below keeps
                    # these rows untouched.
                    new_dp_item["source"] = "from_production_name"
                    new_dp_item_list.append(new_dp_item)
        for remove_item in remove_item_list:
            if remove_item in extract_data["data"]:
                extract_data["data"].remove(remove_item)
        if len(new_dp_item_list) > 0:
            extract_data["data"].extend(new_dp_item_list)
    if len(datapoint_list_with_production_name) == 0:
        # BUG FIX: return the tuple the caller unpacks.
        return data_list, datapoint_list_with_production_name
    # Cleanup pass: per-fund values for propagated datapoints are
    # superseded by the production-name value; drop them, and drop rows
    # left with only name keys.
    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        remove_item_list = []
        for data_item in data:
            if data_item.get("source", "") == "from_production_name":
                continue
            dp_keys = [key for key in data_item if key not in ["fund_name", "share_name"]]
            for dp_key in dp_keys:
                if dp_key in datapoint_list_with_production_name:
                    data_item.pop(dp_key)
            remaining = [key for key in data_item if key not in ["fund_name", "share_name"]]
            if len(remaining) == 0:
                remove_item_list.append(data_item)
        for remove_item in remove_item_list:
            if remove_item in extract_data["data"]:
                extract_data["data"].remove(remove_item)
    return data_list, datapoint_list_with_production_name
def remove_duplicate_data(self, data_list: list): def remove_duplicate_data(self, data_list: list):
""" """
The purpose is to remove duplicate data in the different pages. The purpose is to remove duplicate data in the different pages.
@ -294,15 +373,43 @@ class DataExtraction:
def get_raw_name(self, fund_name: str, share_name: str) -> str:
    """
    Build the canonical "raw name" used to identify a fund/share class.

    Rules:
    - empty fund_name -> empty raw name (share name alone is not enough);
    - identical fund and share name -> the fund name;
    - share name that already starts with the fund name -> the share name;
    - otherwise the two names joined with a space (trailing space stripped
      when share_name is empty).

    :param fund_name: extracted fund name, may be empty.
    :param share_name: extracted share class name, may be empty.
    :return: the combined raw name, or "" when no fund name is available.
    """
    raw_name = ""
    if len(fund_name) == 0:
        return raw_name
    if fund_name == share_name:
        raw_name = fund_name
    elif len(share_name) > 0 and share_name.startswith(fund_name):
        raw_name = share_name
    else:
        # strip() handles the empty-share_name case ("Fund " -> "Fund").
        raw_name = f"{fund_name} {share_name}".strip()
    return raw_name
def get_raw_name_dict(self, data_list: list) -> dict:
    """
    Collect every distinct fund/share raw name seen in the extracted data.

    Iterates all ``extract_data["data"]`` rows and maps each non-empty raw
    name (first occurrence wins) to its source names.

    :param data_list: page-level extraction dicts, each holding
        ``extract_data["data"]`` rows keyed by ``fund_name``/``share_name``.
    :return: dict mapping raw name ->
        ``{"fund_name": ..., "share_name": ...}``.
    """
    raw_name_dict = {}
    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        for data_item in data:
            fund_name = data_item.get("fund_name", "")
            share_name = data_item.get("share_name", "")
            raw_name = self.get_raw_name(fund_name, share_name)
            if len(raw_name) == 0:
                continue
            # Direct dict membership instead of materializing a key list
            # on every row (the original built list(keys()) per lookup).
            if raw_name not in raw_name_dict:
                raw_name_dict[raw_name] = {"fund_name": fund_name, "share_name": share_name}
    return raw_name_dict
def post_adjust_management_fee_costs(self, data_list: list): def post_adjust_management_fee_costs(self, data_list: list):
"""
Adjust the management fee and management fee and costs
Because maybe the management fee and costs disclose in the first pages,
and the management fee disclose in the next pages.
According to biz rule, if can't find management fee when found management fee and costs, the management fee should be same as management fee and costs.
if can't find management fee and costs when found management fee, the management fee and costs should be same as management fee.
This function is to adjust the management fee and management fee and costs according to this case.
"""
management_fee_costs_list = [] management_fee_costs_list = []
management_fee_list = [] management_fee_list = []
for data_dict in data_list: for data_dict in data_list:
@ -375,6 +482,10 @@ class DataExtraction:
def supplement_minimum_initial_investment(self, data_list: list): def supplement_minimum_initial_investment(self, data_list: list):
"""
Minimum initial investment should be same as from every fund/ share class in the same document.
This function is to supplement the minimum initial investment to each fund/ share class in the same document.
"""
exist_minimum_initial_investment = False exist_minimum_initial_investment = False
minimum_initial_investment = -1 minimum_initial_investment = -1
mii_dict = None mii_dict = None

View File

@ -177,6 +177,14 @@
"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:", "For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}" "{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}"
], ],
"administration_fees":[
"Administration fees and costs is share class level data.",
"----Example 1 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
"----Example 1 End----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}"
],
"buy_spread": [ "buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)" "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"

View File

@ -1,5 +1,6 @@
{ {
"prompts": [ "prompts": [
"1. Identify document category: Super or MIS\n",
"In a prospectus for an MIS(Managed Investment Scheme) product youll typically see references to a “responsible entity”, a registration number (ARSN) and disclosures that comply with the Corporations Acts regime for managed investment schemes (e.g. pooling of funds, unit trusts, detailed product disclosures, and rules on redemption).\n", "In a prospectus for an MIS(Managed Investment Scheme) product youll typically see references to a “responsible entity”, a registration number (ARSN) and disclosures that comply with the Corporations Acts regime for managed investment schemes (e.g. pooling of funds, unit trusts, detailed product disclosures, and rules on redemption).\n",
"In contrast, a prospectus or product disclosure statement for a Super(superannuation) product will refer to superannuation or MySuper, include terms related to compulsory employer contributions, tax concessions, and comply with superannuation-specific legislation and guidelines (for example, those issued by APRA or the ATO).\n", "In contrast, a prospectus or product disclosure statement for a Super(superannuation) product will refer to superannuation or MySuper, include terms related to compulsory employer contributions, tax concessions, and comply with superannuation-specific legislation and guidelines (for example, those issued by APRA or the ATO).\n",
"In short, look at the headings, statutory references, product descriptions, and regulatory disclaimers: if they discuss “managed investment schemes” or “responsible entities” and related disclosure obligations under the Corporations Act, its an MIS document; if they mention superannuation, MySuper, employer contributions, and similar features, then it belongs to the Super regime.\n", "In short, look at the headings, statutory references, product descriptions, and regulatory disclaimers: if they discuss “managed investment schemes” or “responsible entities” and related disclosure obligations under the Corporations Act, its an MIS document; if they mention superannuation, MySuper, employer contributions, and similar features, then it belongs to the Super regime.\n",
@ -8,6 +9,24 @@
"If these keywords are not present, the document falls under the MIS regime.", "If these keywords are not present, the document falls under the MIS regime.",
"Please identify whether the document belongs to the Super or MIS regime according to the context, and output answer as JSON format.", "Please identify whether the document belongs to the Super or MIS regime according to the context, and output answer as JSON format.",
"The example is: {\"document_category\": \"Super\"}\n", "The example is: {\"document_category\": \"Super\"}\n",
"\n",
"2. Get production name from document context. \n",
"The production name is the name of the relevant fund(s) product that the document is about. It is usually found in the title of the document or in the first few pages of the document.\n",
"Please provide the production name as a string.\n",
"----Example 1 context start----",
"MLC MasterKey Super & \nPension Fundamentals \n\nYour Guide to what is included in the MLC MasterKey Super \n& Pension Fundamentals Product Disclosure Statement \n\nPreparation date \n\n30 September 2022 \n\nIssued by the Trustee \n\nNULIS Nominees \n(Australia) Limited \nABN 80 008 515 633 \nAFSL 236465 \n\nThe Fund \n\nMLC Super Fund \nABN 70 732 426 024 \n\nMLC MasterKey Super & \nPension Fundamentals \n\nProduct Disclosure Statement \n\nThe Insurer \n\nInsurance is issued by \nMLC Limited \nABN 90 000 000 \n402 AFSL 230694 \n\n1. About MLC MasterKey Super & Pension Fundamentals \n\nYou can use this Product Disclosure Statement (PDS) to find what you need to know \nabout your super and how we can help you reach your retirement goals",
"----Example 1 context end----",
"The output should be:",
"{\"document_production\": \"MLC MasterKey Super & Pension Fundamentals\"}\n",
"\n",
"----Example 2 context start----",
"Pension \nProduct Disclosure \nStatement \n\nThis legalsuper Pension Product Disclosure Statement is \n\nissued by Legal Super Pty Ltd, Level 9, 627 Chapel Street, \nSouth Yarra, 3141 (ABN 37 004 455 789, AFSL 246315) \nas the Trustee for legalsuper ABN 60 346 078 879. \n\nIssued 1 April 2023\nLEGALSUPER PENSION \nPRODUCT DISCLOSURE STATEMENT \n\nIssued by Legal Super Pty Ltd \nLevel 9, 627 Chapel Street, South Yarra, 3141 \nABN 37 004 455 789, AFSL 246315, L0002585 \nlegalsuper ABN 60 346 078 879 (the Fund) \n\nFund Contact Details \n\nPhone: 1800 060 312 (8am to 8pm [AEST/AEDT] Monday to Friday)\nEmail: mail@legalsuper.com.au \nlegalsuper.com.au \n\nDate of preparation: 1 April 2023",
"----Example 2 context end----",
"The output should be:",
"{\"document_production\": \"legalsuper Pension\"}\n",
"\n",
"3. Output format, please provide the answer with both of document_category and document_production format, here is the example:\n",
"{\"document_category\": \"Super\", \"document_production\": \"ABC Superannuation Fund\"}\n",
"Answer:\n" "Answer:\n"
] ]
} }

22
main.py
View File

@ -1377,7 +1377,7 @@ def merge_output_data_aus_prospectus(
def get_aus_prospectus_document_category(): def get_aus_prospectus_document_category():
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_29_documents_sample.txt" r"./sample_documents/aus_prospectus_17_documents_sample.txt"
) )
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
@ -1416,20 +1416,21 @@ def get_aus_prospectus_document_category():
document_mapping_info_df=emea_ar_parsing.document_mapping_info_df, document_mapping_info_df=emea_ar_parsing.document_mapping_info_df,
extract_way=extract_way extract_way=extract_way
) )
logger.info(f"Document: {doc_id}, category: {data_extraction.document_category}") logger.info(f"Document: {doc_id}, \ncategory: {data_extraction.document_category}, \nproduction: {data_extraction.document_production}")
document_category_dict[doc_id] = data_extraction.document_category document_category_dict[doc_id] = {"category": data_extraction.document_category, "production": data_extraction.document_production}
output_extract_document_category_folder: str = ( output_extract_document_category_folder: str = (
r"/data/aus_prospectus/output/document_category/" r"/data/aus_prospectus/output/document_category/"
) )
os.makedirs(output_extract_document_category_folder, exist_ok=True) os.makedirs(output_extract_document_category_folder, exist_ok=True)
output_file = os.path.join(output_extract_document_category_folder, "29_documents_category.json") document_sample_file_base_name = os.path.basename(document_sample_file).replace(".txt", "").replace("aus_prospectus_", "")
output_file = os.path.join(output_extract_document_category_folder, f"{document_sample_file_base_name}_category_production.json")
with open(output_file, "w", encoding="utf-8") as f: with open(output_file, "w", encoding="utf-8") as f:
json.dump(document_category_dict, f, ensure_ascii=False, indent=4) json.dump(document_category_dict, f, ensure_ascii=False, indent=4)
logger.info(f"Document category: {document_category_dict}") logger.info(f"Document category and production: {document_category_dict}")
def test_remove_duplicate_extract_data(): def test_post_adjust_extract_data():
doc_id = "369105359" doc_id = "454036250"
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1467,11 +1468,12 @@ def test_remove_duplicate_extract_data():
data_file_path = os.path.join(data_folder, data_file) data_file_path = os.path.join(data_folder, data_file)
with open(data_file_path, "r", encoding="utf-8") as f: with open(data_file_path, "r", encoding="utf-8") as f:
data_list = json.load(f) data_list = json.load(f)
data_list = data_extraction.remove_duplicate_data(data_list) # data_list = data_extraction.remove_duplicate_data(data_list)
data_list = data_extraction.post_adjust_for_value_with_production_name(data_list)
if __name__ == "__main__": if __name__ == "__main__":
# test_remove_duplicate_extract_data() # test_post_adjust_extract_data()
# get_aus_prospectus_document_category() # get_aus_prospectus_document_category()
# test_data_extraction_metrics() # test_data_extraction_metrics()
# data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_1_documents_by_text_20250226155259.xlsx" # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_1_documents_by_text_20250226155259.xlsx"
@ -1531,7 +1533,7 @@ if __name__ == "__main__":
# "555377021", # "555377021",
# "555654388", # "555654388",
# ] # ]
# special_doc_id_list: list = ["471206458"] # special_doc_id_list: list = ["454036250"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (