1. Support calling the ChatGPT API a second time to match predicted fund/share names that were still unmatched after the first call

2. If the document contains fewer than 3 fund names, skip the production name judgment logic
This commit is contained in:
Blade He 2025-04-02 16:34:41 -05:00
parent 4cee95db9a
commit 427a379b3b
6 changed files with 258 additions and 111 deletions

View File

@ -653,66 +653,113 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
llm_result = json_repair.loads(llm_response['response']) llm_result = json_repair.loads(llm_response['response'])
except: except:
llm_result = {} llm_result = {}
# try: unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
# llm_result = ast.literal_eval(llm_response['response'].replace('\n','')) unmatched_pred_list,
# except Exception as e: cleaned_unmatched_pred_list,
# logger.info(f"error: {e}") unmatched_db_list,
# cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '') cleaned_unmatched_db_list,
# llm_result = json.loads(cleaned_response) df_data,
# logger.info(f"\n\n llm_result: {llm_result}") final_result,
for pred_name,db_name in llm_result.items(): record_empty=False)
# print("k: ",k) """
# print("v: ",v) For some cases, same document,
og_db_index=-1 perhaps same funds/ shares are with different raw names in different pages.
# og_pred_index = -1 e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
og_pred_index_list = [] But if only call ChatGPT API one time, it will not be able to match all of them.
if pred_name in cleaned_unmatched_pred_list: """
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): if len(unmantched_pred_index_list)>0:
if c_item==pred_name: unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
og_pred_index_list.append(c_idx) cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
# og_pred_index = cleaned_unmatched_pred_list.index(k) prompt_context = f"""
{prompt_instruction}
provider_name: {provider_name}
prediction_fund:
{cleaned_unmatched_pred_list}
if len(og_pred_index_list) == 0: true_fund:
# sometimes, the raw name and db name reversed from the LLM response {cleaned_unmatched_db_list}
if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list: """
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): llm_response, with_error = chat(
if c_item==db_name: prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
og_pred_index_list.append(c_idx) )
# og_pred_index = cleaned_unmatched_pred_list.index(v) # logger.info(f"fund matching LLM Response: {llm_response}")
og_db_index = cleaned_unmatched_db_list.index(pred_name) if 'response' in llm_response.keys():
# v and k are swapped try:
temp = db_name llm_result = json.loads(llm_response['response'])
db_name = pred_name except:
pred_name = temp try:
if len(og_pred_index_list)==0: llm_result = json_repair.loads(llm_response['response'])
continue except:
# og_db_index = cleaned_unmatched_db_list.index(v) llm_result = {}
if og_db_index == -1 and db_name in cleaned_unmatched_db_list: unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
og_db_index = cleaned_unmatched_db_list.index(db_name) unmatched_pred_list,
# print("og_db_index: ",og_db_index, cleaned_unmatched_db_list) cleaned_unmatched_pred_list,
# print("unmatched_db_list: ",unmatched_db_list) unmatched_db_list,
cleaned_unmatched_db_list,
for i in df_data: df_data,
for og_pred_index in og_pred_index_list: final_result,
if i['pred_fund']==unmatched_pred_list[og_pred_index]: record_empty=True)
if og_db_index!=-1:
i['db_fund']=unmatched_db_list[og_db_index] return final_result
i['cleaned_db_fund_name'] = db_name
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
else: def post_handle_fund_matching_call(llm_result,
i['db_fund'] = '' unmatched_pred_list,
i['cleaned_db_fund_name'] = '' cleaned_unmatched_pred_list,
final_result.update({unmatched_pred_list[og_pred_index]:""}) unmatched_db_list,
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list cleaned_unmatched_db_list,
i['llm_clean_db_list'] = cleaned_unmatched_db_list, df_data,
i['llm_pred_fund'] = pred_name final_result,
i['llm_matched_db_name'] = db_name record_empty: bool = False):
i['llm_result'] = llm_result unmantched_pred_index_list = []
break for pred_name,db_name in llm_result.items():
og_db_index=-1
og_pred_index_list = []
# break if pred_name in cleaned_unmatched_pred_list:
return final_result for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==pred_name:
og_pred_index_list.append(c_idx)
if len(og_pred_index_list) == 0:
# sometimes, the raw name and db name reversed from the LLM response
if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==db_name:
og_pred_index_list.append(c_idx)
og_db_index = cleaned_unmatched_db_list.index(pred_name)
# v and k are swapped
temp = db_name
db_name = pred_name
pred_name = temp
if len(og_pred_index_list)==0:
continue
if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
og_db_index = cleaned_unmatched_db_list.index(db_name)
for i in df_data:
for og_pred_index in og_pred_index_list:
if i['pred_fund']==unmatched_pred_list[og_pred_index]:
if og_db_index!=-1:
i['db_fund']=unmatched_db_list[og_db_index]
i['cleaned_db_fund_name'] = db_name
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
else:
unmantched_pred_index_list.append(og_pred_index)
i['db_fund'] = ''
i['cleaned_db_fund_name'] = ''
if record_empty:
final_result.update({unmatched_pred_list[og_pred_index]:""})
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
i['llm_pred_fund'] = pred_name
i['llm_matched_db_name'] = db_name
i['llm_result'] = llm_result
break
return unmantched_pred_index_list
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names): def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
result = api_response['data'] result = api_response['data']

View File

@ -560,6 +560,8 @@ class DataExtraction:
""" """
raw_name_dict = self.get_raw_name_dict(data_list) raw_name_dict = self.get_raw_name_dict(data_list)
raw_name_list = list(raw_name_dict.keys()) raw_name_list = list(raw_name_dict.keys())
if len(raw_name_list) < 3:
return data_list, []
raw_name_as_production_name = None raw_name_as_production_name = None
for raw_name in raw_name_list: for raw_name in raw_name_list:
if self.is_production_name(raw_name): if self.is_production_name(raw_name):
@ -716,6 +718,8 @@ class DataExtraction:
raw_name = self.get_raw_name(fund_name, share_name) raw_name = self.get_raw_name(fund_name, share_name)
if len(raw_name) == 0: if len(raw_name) == 0:
continue continue
if raw_name.lower() in ["the fund", "sample fund"]:
continue
# if isinstance(self.document_production, str) and \ # if isinstance(self.document_production, str) and \
# raw_name.lower() in self.document_production.lower(): # raw_name.lower() in self.document_production.lower():
# continue # continue
@ -1732,7 +1736,7 @@ class DataExtraction:
continue continue
found_regex_text = False found_regex_text = False
for regex_text in regex_text_list: for regex_text in regex_text_list:
regex_search = re.search(regex_text, page_text) regex_search = re.search(regex_text, page_text, re.IGNORECASE)
if regex_search is not None: if regex_search is not None:
found_regex_text = True found_regex_text = True
break break

View File

@ -499,7 +499,13 @@
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n", "\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
"---Example 4 End---", "---Example 4 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund Class A\", \"share name\": \"Allan Gray Australian Equity Fund Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}" "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund Class A\", \"share name\": \"Allan Gray Australian Equity Fund Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
"\n",
"---Example 5 Start---",
"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund Class A \n0.96% 0.05%\n\n",
"---Example 5 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
], ],
"performance_fee_costs": [ "performance_fee_costs": [
"### Performance fees", "### Performance fees",

View File

@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
def test_post_adjust_extract_data(): def test_post_adjust_extract_data():
doc_id = "448576924" doc_id = "480854121"
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1534,11 +1534,14 @@ if __name__ == "__main__":
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt" r"./sample_documents/aus_prospectus_46_documents_sample.txt"
) )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
# )
logger.info(f"Start to run document sample file: {document_sample_file}") logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0] if len(doc_id.strip()) > 0]
# special_doc_id_list = ["384508026"] # special_doc_id_list = ["527969661"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,87 @@
430229604
430249980
434533711
448576798
448576868
448576914
448576924
448577874
448577877
448578148
448701586
448906715
448906720
448906722
448907811
451234748
454947973
454947982
454948291
454948296
455232983
455235248
462770987
470958290
470958296
478920274
478946988
479996914
479996918
480713037
480726184
480726185
480854103
480854105
480854113
480854115
480854118
480854120
480854121
480854129
481877313
484628699
484628701
484628702
484628703
495516375
495547519
500579230
506913190
509581748
520698753
520702746
520703007
521591949
521606716
521606755
523516443
525464665
528208796
534933875
539999907
539999916
540028470
542294088
544886057
548035617
550533961
550769189
552727485
555377021
556527310
557362550
557526104
557526108
557526111
557526129
557526130
557526143
557526145
562753667
562753673
562754590
570781265
572302455
572302463
573372424
577949367