1. Support calling the ChatGPT API a second time to match predicted fund/share names that were left unmatched by the first call.

2. If a document contains fewer than 3 fund/share raw names, skip the production name judgment logic.
2025-04-02 16:34:41 -05:00 · 2025-04-02 16:34:41 -05:00 · 427a379b3b
parent 4cee95db9a
commit 427a379b3b
6 changed files with 258 additions and 111 deletions
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@ -653,67 +653,114 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                    llm_result = json_repair.loads(llm_response['response'])
                except:
                    llm_result = {}
-            # try:
-            #     llm_result = ast.literal_eval(llm_response['response'].replace('\n',''))
-            # except Exception as e:
-            #     logger.info(f"error: {e}")
-            #     cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '')
-            #     llm_result = json.loads(cleaned_response)
-            # logger.info(f"\n\n llm_result: {llm_result}")            
-            for pred_name,db_name in llm_result.items():
-                # print("k: ",k)
-                # print("v: ",v)
-                og_db_index=-1
-                # og_pred_index = -1
-                og_pred_index_list = []
-                if pred_name in cleaned_unmatched_pred_list:
-                    for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
-                        if c_item==pred_name:
-                            og_pred_index_list.append(c_idx)
-                    # og_pred_index = cleaned_unmatched_pred_list.index(k)
+            unmantched_pred_index_list = post_handle_fund_matching_call(llm_result, 
+                                                                        unmatched_pred_list, 
+                                                                        cleaned_unmatched_pred_list, 
+                                                                        unmatched_db_list, 
+                                                                        cleaned_unmatched_db_list, 
+                                                                        df_data, 
+                                                                        final_result,
+                                                                        record_empty=False)
+            """
+            For some cases, same document, 
+            perhaps same funds/ shares are with different raw names in different pages.
+            e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
+            But if only call ChatGPT API one time, it will not be able to match all of them.
+            """
+            if len(unmantched_pred_index_list)>0:
+                unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
+                cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
+                prompt_context = f"""
+                {prompt_instruction}

-                if len(og_pred_index_list) == 0:
-                    # sometimes, the raw name and db name reversed from the LLM response
-                    if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
-                        for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
-                            if c_item==db_name:
-                                og_pred_index_list.append(c_idx)
-                        # og_pred_index = cleaned_unmatched_pred_list.index(v)
-                        og_db_index = cleaned_unmatched_db_list.index(pred_name)
-                        # v and k are swapped
-                        temp = db_name
-                        db_name = pred_name
-                        pred_name = temp
-                if len(og_pred_index_list)==0:
-                    continue
-                # og_db_index = cleaned_unmatched_db_list.index(v)
-                if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
-                    og_db_index = cleaned_unmatched_db_list.index(db_name)
-                # print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
-                # print("unmatched_db_list: ",unmatched_db_list)
+                provider_name: {provider_name}

-                for i in df_data:
-                    for og_pred_index in og_pred_index_list:
-                        if i['pred_fund']==unmatched_pred_list[og_pred_index]:
-                            if og_db_index!=-1:
-                                i['db_fund']=unmatched_db_list[og_db_index]
-                                i['cleaned_db_fund_name'] = db_name
-                                final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
-                            else:
-                                i['db_fund'] = ''
-                                i['cleaned_db_fund_name'] = ''
-                                final_result.update({unmatched_pred_list[og_pred_index]:""})
-                            i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
-                            i['llm_clean_db_list'] = cleaned_unmatched_db_list,
-                            i['llm_pred_fund'] = pred_name
-                            i['llm_matched_db_name'] = db_name
-                            i['llm_result'] = llm_result
-                            break
+                prediction_fund: 
+                {cleaned_unmatched_pred_list}
                
+                true_fund: 
+                {cleaned_unmatched_db_list}
+                """
+                llm_response, with_error = chat(
+                    prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
+                    )
+                # logger.info(f"fund matching LLM Response: {llm_response}")
+                if 'response' in llm_response.keys():
+                    try:
+                        llm_result = json.loads(llm_response['response'])
+                    except:
+                        try:
+                            llm_result = json_repair.loads(llm_response['response'])
+                        except:
+                            llm_result = {}
+                    unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
+                                                                            unmatched_pred_list, 
+                                                                            cleaned_unmatched_pred_list, 
+                                                                            unmatched_db_list, 
+                                                                            cleaned_unmatched_db_list, 
+                                                                            df_data, 
+                                                                            final_result,
+                                                                            record_empty=True)

-        # break
    return final_result

+
+def post_handle_fund_matching_call(llm_result, 
+                                   unmatched_pred_list, 
+                                   cleaned_unmatched_pred_list, 
+                                   unmatched_db_list, 
+                                   cleaned_unmatched_db_list, 
+                                   df_data, 
+                                   final_result,
+                                   record_empty: bool = False):
+    unmantched_pred_index_list = []
+    for pred_name,db_name in llm_result.items():
+        og_db_index=-1
+        og_pred_index_list = []
+        if pred_name in cleaned_unmatched_pred_list:
+            for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+                if c_item==pred_name:
+                    og_pred_index_list.append(c_idx)
+        
+        if len(og_pred_index_list) == 0:
+            # sometimes, the raw name and db name reversed from the LLM response
+            if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
+                for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
+                    if c_item==db_name:
+                        og_pred_index_list.append(c_idx)
+                og_db_index = cleaned_unmatched_db_list.index(pred_name)
+                # v and k are swapped
+                temp = db_name
+                db_name = pred_name
+                pred_name = temp
+        if len(og_pred_index_list)==0:
+            continue
+        if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
+            og_db_index = cleaned_unmatched_db_list.index(db_name)
+        
+        for i in df_data:
+            for og_pred_index in og_pred_index_list:
+                if i['pred_fund']==unmatched_pred_list[og_pred_index]:
+                    if og_db_index!=-1:
+                        i['db_fund']=unmatched_db_list[og_db_index]
+                        i['cleaned_db_fund_name'] = db_name
+                        final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
+                    else:
+                        unmantched_pred_index_list.append(og_pred_index)
+                        i['db_fund'] = ''
+                        i['cleaned_db_fund_name'] = ''
+                        if record_empty:
+                            final_result.update({unmatched_pred_list[og_pred_index]:""})
+                    i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
+                    i['llm_clean_db_list'] = cleaned_unmatched_db_list,
+                    i['llm_pred_fund'] = pred_name
+                    i['llm_matched_db_name'] = db_name
+                    i['llm_result'] = llm_result
+                    break
+    return unmantched_pred_index_list
+
+
+
 def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
    result = api_response['data']
    doc_fund_names = [item['fund_name'] for item in result]
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -560,6 +560,8 @@ class DataExtraction:
        """
        raw_name_dict = self.get_raw_name_dict(data_list)
        raw_name_list = list(raw_name_dict.keys())
+        if len(raw_name_list) < 3:
+            return data_list, []
        raw_name_as_production_name = None
        for raw_name in raw_name_list:
            if self.is_production_name(raw_name):
@ -716,6 +718,8 @@ class DataExtraction:
                raw_name = self.get_raw_name(fund_name, share_name)
                if len(raw_name) == 0:
                    continue
+                if raw_name.lower() in ["the fund", "sample fund"]:
+                    continue
                # if isinstance(self.document_production, str) and \
                #     raw_name.lower() in self.document_production.lower():
                #     continue
@ -1732,7 +1736,7 @@ class DataExtraction:
                continue
            found_regex_text = False
            for regex_text in regex_text_list:
-                regex_search = re.search(regex_text, page_text)
+                regex_search = re.search(regex_text, page_text, re.IGNORECASE)
                if regex_search is not None:
                    found_regex_text = True
                    break
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -499,7 +499,13 @@
 				"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
 				"---Example 4 End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}"
+				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
+				"\n",
+				"---Example 5 Start---",
+				"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund – Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund – Class A \n0.96% 0.05%\n\n",
+				"---Example 5 End---",
+				"The output should be:",
+				"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
 			],
 			"performance_fee_costs": [
 				"### Performance fees",
--- a/main.py
+++ b/main.py
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():


 def test_post_adjust_extract_data():
-    doc_id = "448576924"
+    doc_id = "480854121"
    pdf_folder: str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = (
@ -1534,11 +1534,14 @@ if __name__ == "__main__":
        document_sample_file = (
            r"./sample_documents/aus_prospectus_46_documents_sample.txt"
        )
+        # document_sample_file = (
+        #     r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
+        # )
        logger.info(f"Start to run document sample file: {document_sample_file}")
        with open(document_sample_file, "r", encoding="utf-8") as f:
            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
                                    if len(doc_id.strip()) > 0]
-        # special_doc_id_list = ["384508026"]
+        # special_doc_id_list = ["527969661"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (
--- a/performance.ipynb
+++ b/performance.ipynb
--- a/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
+++ b/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
@ -0,0 +1,87 @@
+430229604
+430249980
+434533711
+448576798
+448576868
+448576914
+448576924
+448577874
+448577877
+448578148
+448701586
+448906715
+448906720
+448906722
+448907811
+451234748
+454947973
+454947982
+454948291
+454948296
+455232983
+455235248
+462770987
+470958290
+470958296
+478920274
+478946988
+479996914
+479996918
+480713037
+480726184
+480726185
+480854103
+480854105
+480854113
+480854115
+480854118
+480854120
+480854121
+480854129
+481877313
+484628699
+484628701
+484628702
+484628703
+495516375
+495547519
+500579230
+506913190
+509581748
+520698753
+520702746
+520703007
+521591949
+521606716
+521606755
+523516443
+525464665
+528208796
+534933875
+539999907
+539999916
+540028470
+542294088
+544886057
+548035617
+550533961
+550769189
+552727485
+555377021
+556527310
+557362550
+557526104
+557526108
+557526111
+557526129
+557526130
+557526143
+557526145
+562753667
+562753673
+562754590
+570781265
+572302455
+572302463
+573372424
+577949367