1. Support re-calling the ChatGPT API to match predicted fund/share names that were left unmatched by the first call

2. If the document contains fewer than 3 fund names, skip the production name judgment logic
2025-04-02 16:34:41 -05:00 · 2025-04-02 16:34:41 -05:00 · 427a379b3b
parent 4cee95db9a
commit 427a379b3b
6 changed files with 258 additions and 111 deletions
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@ -653,66 +653,113 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                    llm_result = json_repair.loads(llm_response['response'])
                except:
                    llm_result = {}
-            # try:
+            unmantched_pred_index_list = post_handle_fund_matching_call(llm_result, 
-            #     llm_result = ast.literal_eval(llm_response['response'].replace('\n',''))
+                                                                        unmatched_pred_list, 
-            # except Exception as e:
+                                                                        cleaned_unmatched_pred_list, 
-            #     logger.info(f"error: {e}")
+                                                                        unmatched_db_list, 
-            #     cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '')
+                                                                        cleaned_unmatched_db_list, 
-            #     llm_result = json.loads(cleaned_response)
+                                                                        df_data, 
-            # logger.info(f"\n\n llm_result: {llm_result}")            
+                                                                        final_result,
-            for pred_name,db_name in llm_result.items():
+                                                                        record_empty=False)
-                # print("k: ",k)
+            """
-                # print("v: ",v)
+            For some cases, same document, 
-                og_db_index=-1
+            perhaps same funds/ shares are with different raw names in different pages.
-                # og_pred_index = -1
+            e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
-                og_pred_index_list = []
+            But if only call ChatGPT API one time, it will not be able to match all of them.
-                if pred_name in cleaned_unmatched_pred_list:
+            """
-                    for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+            if len(unmantched_pred_index_list)>0:
-                        if c_item==pred_name:
+                unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
-                            og_pred_index_list.append(c_idx)
+                cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
-                    # og_pred_index = cleaned_unmatched_pred_list.index(k)
+                prompt_context = f"""
                {prompt_instruction}
                provider_name: {provider_name}
                prediction_fund: 
                {cleaned_unmatched_pred_list}
-                if len(og_pred_index_list) == 0:
+                true_fund: 
-                    # sometimes, the raw name and db name reversed from the LLM response
+                {cleaned_unmatched_db_list}
-                    if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
+                """
-                        for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+                llm_response, with_error = chat(
-                            if c_item==db_name:
+                    prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
-                                og_pred_index_list.append(c_idx)
+                    )
-                        # og_pred_index = cleaned_unmatched_pred_list.index(v)
+                # logger.info(f"fund matching LLM Response: {llm_response}")
-                        og_db_index = cleaned_unmatched_db_list.index(pred_name)
+                if 'response' in llm_response.keys():
-                        # v and k are swapped
+                    try:
-                        temp = db_name
+                        llm_result = json.loads(llm_response['response'])
-                        db_name = pred_name
+                    except:
-                        pred_name = temp
+                        try:
-                if len(og_pred_index_list)==0:
+                            llm_result = json_repair.loads(llm_response['response'])
-                    continue
+                        except:
-                # og_db_index = cleaned_unmatched_db_list.index(v)
+                            llm_result = {}
-                if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
+                    unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
-                    og_db_index = cleaned_unmatched_db_list.index(db_name)
+                                                                            unmatched_pred_list, 
-                # print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
+                                                                            cleaned_unmatched_pred_list, 
-                # print("unmatched_db_list: ",unmatched_db_list)
+                                                                            unmatched_db_list, 
-                
+                                                                            cleaned_unmatched_db_list, 
-                for i in df_data:
+                                                                            df_data, 
-                    for og_pred_index in og_pred_index_list:
+                                                                            final_result,
-                        if i['pred_fund']==unmatched_pred_list[og_pred_index]:
+                                                                            record_empty=True)
-                            if og_db_index!=-1:
+
-                                i['db_fund']=unmatched_db_list[og_db_index]
+    return final_result
-                                i['cleaned_db_fund_name'] = db_name
+
-                                final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
+
-                            else:
def post_handle_fund_matching_call(llm_result,
                                   unmatched_pred_list,
                                   cleaned_unmatched_pred_list,
                                   unmatched_db_list,
                                   cleaned_unmatched_db_list,
                                   df_data,
                                   final_result,
                                   record_empty: bool = False):
    """Apply one LLM fund-matching response to the bookkeeping structures.

    Each ``{prediction name: db name}`` pair in ``llm_result`` is looked up in
    the cleaned name lists. The corresponding rows of ``df_data`` and the
    ``final_result`` mapping are updated in place.

    Args:
        llm_result: Parsed LLM response, expected to be a dict mapping a
            cleaned prediction name to a cleaned db name. Any other type is
            treated as an empty response.
        unmatched_pred_list: Raw prediction names, index-aligned with
            ``cleaned_unmatched_pred_list``.
        cleaned_unmatched_pred_list: Cleaned prediction names sent to the LLM.
        unmatched_db_list: Raw db names, index-aligned with
            ``cleaned_unmatched_db_list``.
        cleaned_unmatched_db_list: Cleaned db names sent to the LLM.
        df_data: List of row dicts; each row is matched through its
            ``'pred_fund'`` value and mutated in place.
        final_result: Dict of raw prediction name -> raw db name, mutated in
            place.
        record_empty: When True, predictions without a db match are recorded
            in ``final_result`` with an empty string; when False they are left
            out so a later call can still fill them in.

    Returns:
        Indexes (into ``unmatched_pred_list``) of the predictions the LLM
        answered for but that could not be mapped to a known db name, without
        duplicates and in first-seen order.
    """
    # A repaired/garbled LLM payload may parse to a list or a string; there is
    # nothing to apply in that case.
    if not isinstance(llm_result, dict):
        return []

    unmatched_pred_index_list = []
    for pred_name, db_name in llm_result.items():
        db_index = -1
        pred_indexes = [idx for idx, name in enumerate(cleaned_unmatched_pred_list)
                        if name == pred_name]
        if (not pred_indexes
                and db_name in cleaned_unmatched_pred_list
                and pred_name in cleaned_unmatched_db_list):
            # Sometimes the LLM returns the pair reversed (db name as key,
            # prediction name as value): resolve both sides, then swap.
            pred_indexes = [idx for idx, name in enumerate(cleaned_unmatched_pred_list)
                            if name == db_name]
            db_index = cleaned_unmatched_db_list.index(pred_name)
            pred_name, db_name = db_name, pred_name
        if not pred_indexes:
            continue
        if db_index == -1 and db_name in cleaned_unmatched_db_list:
            db_index = cleaned_unmatched_db_list.index(db_name)

        for row in df_data:
            for pred_index in pred_indexes:
                raw_pred_name = unmatched_pred_list[pred_index]
                if row['pred_fund'] != raw_pred_name:
                    continue
                if db_index != -1:
                    row['db_fund'] = unmatched_db_list[db_index]
                    row['cleaned_db_fund_name'] = db_name
                    final_result.update({raw_pred_name: unmatched_db_list[db_index]})
                else:
                    # Several rows can share one prediction name; report the
                    # index only once so a retry prompt has no duplicates.
                    if pred_index not in unmatched_pred_index_list:
                        unmatched_pred_index_list.append(pred_index)
                    row['db_fund'] = ''
                    row['cleaned_db_fund_name'] = ''
                    if record_empty:
                        final_result.update({raw_pred_name: ""})
                row['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                # Store the list itself (a stray trailing comma previously
                # wrapped it in a one-element tuple).
                row['llm_clean_db_list'] = cleaned_unmatched_db_list
                row['llm_pred_fund'] = pred_name
                row['llm_matched_db_name'] = db_name
                row['llm_result'] = llm_result
                break
    return unmatched_pred_index_list
 def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
    result = api_response['data']
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -560,6 +560,8 @@ class DataExtraction:
        """
        raw_name_dict = self.get_raw_name_dict(data_list)
        raw_name_list = list(raw_name_dict.keys())
        if len(raw_name_list) < 3:
            return data_list, []
        raw_name_as_production_name = None
        for raw_name in raw_name_list:
            if self.is_production_name(raw_name):
@ -716,6 +718,8 @@ class DataExtraction:
                raw_name = self.get_raw_name(fund_name, share_name)
                if len(raw_name) == 0:
                    continue
                if raw_name.lower() in ["the fund", "sample fund"]:
                    continue
                # if isinstance(self.document_production, str) and \
                #     raw_name.lower() in self.document_production.lower():
                #     continue
@ -1732,7 +1736,7 @@ class DataExtraction:
                continue
            found_regex_text = False
            for regex_text in regex_text_list:
-                regex_search = re.search(regex_text, page_text)
+                regex_search = re.search(regex_text, page_text, re.IGNORECASE)
                if regex_search is not None:
                    found_regex_text = True
                    break
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -499,7 +499,13 @@
 				"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
 				"---Example 4 End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}"
+				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
 				"\n",
 				"---Example 5 Start---",
 				"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund – Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund – Class A \n0.96% 0.05%\n\n",
 				"---Example 5 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
 			],
 			"performance_fee_costs": [
 				"### Performance fees",
--- a/main.py
+++ b/main.py
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
 def test_post_adjust_extract_data():
-    doc_id = "448576924"
+    doc_id = "480854121"
    pdf_folder: str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = (
@ -1534,11 +1534,14 @@ if __name__ == "__main__":
        document_sample_file = (
            r"./sample_documents/aus_prospectus_46_documents_sample.txt"
        )
        # document_sample_file = (
        #     r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
        # )
        logger.info(f"Start to run document sample file: {document_sample_file}")
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
                                    if len(doc_id.strip()) > 0]
-        # special_doc_id_list = ["384508026"]
+        # special_doc_id_list = ["527969661"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (
--- a/performance.ipynb
+++ b/performance.ipynb
--- a/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
+++ b/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
@ -0,0 +1,87 @@
 430229604
 430249980
 434533711
 448576798
 448576868
 448576914
 448576924
 448577874
 448577877
 448578148
 448701586
 448906715
 448906720
 448906722
 448907811
 451234748
 454947973
 454947982
 454948291
 454948296
 455232983
 455235248
 462770987
 470958290
 470958296
 478920274
 478946988
 479996914
 479996918
 480713037
 480726184
 480726185
 480854103
 480854105
 480854113
 480854115
 480854118
 480854120
 480854121
 480854129
 481877313
 484628699
 484628701
 484628702
 484628703
 495516375
 495547519
 500579230
 506913190
 509581748
 520698753
 520702746
 520703007
 521591949
 521606716
 521606755
 523516443
 525464665
 528208796
 534933875
 539999907
 539999916
 540028470
 542294088
 544886057
 548035617
 550533961
 550769189
 552727485
 555377021
 556527310
 557362550
 557526104
 557526108
 557526111
 557526129
 557526130
 557526143
 557526145
 562753667
 562753673
 562754590
 570781265
 572302455
 572302463
 573372424
 577949367