1. Support re-calling the ChatGPT API to match predicted fund/share names that were still unmatched after the first call
2. If a document contains fewer than 3 fund names, skip the production-name judgment logic
This commit is contained in:
parent
4cee95db9a
commit
427a379b3b
|
|
@ -653,67 +653,114 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
|||
llm_result = json_repair.loads(llm_response['response'])
|
||||
except:
|
||||
llm_result = {}
|
||||
# try:
|
||||
# llm_result = ast.literal_eval(llm_response['response'].replace('\n',''))
|
||||
# except Exception as e:
|
||||
# logger.info(f"error: {e}")
|
||||
# cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '')
|
||||
# llm_result = json.loads(cleaned_response)
|
||||
# logger.info(f"\n\n llm_result: {llm_result}")
|
||||
for pred_name,db_name in llm_result.items():
|
||||
# print("k: ",k)
|
||||
# print("v: ",v)
|
||||
og_db_index=-1
|
||||
# og_pred_index = -1
|
||||
og_pred_index_list = []
|
||||
if pred_name in cleaned_unmatched_pred_list:
|
||||
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
||||
if c_item==pred_name:
|
||||
og_pred_index_list.append(c_idx)
|
||||
# og_pred_index = cleaned_unmatched_pred_list.index(k)
|
||||
unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
|
||||
unmatched_pred_list,
|
||||
cleaned_unmatched_pred_list,
|
||||
unmatched_db_list,
|
||||
cleaned_unmatched_db_list,
|
||||
df_data,
|
||||
final_result,
|
||||
record_empty=False)
|
||||
"""
|
||||
For some cases, same document,
|
||||
perhaps same funds/ shares are with different raw names in different pages.
|
||||
e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
|
||||
But if only call ChatGPT API one time, it will not be able to match all of them.
|
||||
"""
|
||||
if len(unmantched_pred_index_list)>0:
|
||||
unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
|
||||
cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
|
||||
prompt_context = f"""
|
||||
{prompt_instruction}
|
||||
|
||||
if len(og_pred_index_list) == 0:
|
||||
# sometimes, the raw name and db name reversed from the LLM response
|
||||
if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
|
||||
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
||||
if c_item==db_name:
|
||||
og_pred_index_list.append(c_idx)
|
||||
# og_pred_index = cleaned_unmatched_pred_list.index(v)
|
||||
og_db_index = cleaned_unmatched_db_list.index(pred_name)
|
||||
# v and k are swapped
|
||||
temp = db_name
|
||||
db_name = pred_name
|
||||
pred_name = temp
|
||||
if len(og_pred_index_list)==0:
|
||||
continue
|
||||
# og_db_index = cleaned_unmatched_db_list.index(v)
|
||||
if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
|
||||
og_db_index = cleaned_unmatched_db_list.index(db_name)
|
||||
# print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
|
||||
# print("unmatched_db_list: ",unmatched_db_list)
|
||||
provider_name: {provider_name}
|
||||
|
||||
for i in df_data:
|
||||
for og_pred_index in og_pred_index_list:
|
||||
if i['pred_fund']==unmatched_pred_list[og_pred_index]:
|
||||
if og_db_index!=-1:
|
||||
i['db_fund']=unmatched_db_list[og_db_index]
|
||||
i['cleaned_db_fund_name'] = db_name
|
||||
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
|
||||
else:
|
||||
i['db_fund'] = ''
|
||||
i['cleaned_db_fund_name'] = ''
|
||||
final_result.update({unmatched_pred_list[og_pred_index]:""})
|
||||
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
|
||||
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
|
||||
i['llm_pred_fund'] = pred_name
|
||||
i['llm_matched_db_name'] = db_name
|
||||
i['llm_result'] = llm_result
|
||||
break
|
||||
prediction_fund:
|
||||
{cleaned_unmatched_pred_list}
|
||||
|
||||
true_fund:
|
||||
{cleaned_unmatched_db_list}
|
||||
"""
|
||||
llm_response, with_error = chat(
|
||||
prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
|
||||
)
|
||||
# logger.info(f"fund matching LLM Response: {llm_response}")
|
||||
if 'response' in llm_response.keys():
|
||||
try:
|
||||
llm_result = json.loads(llm_response['response'])
|
||||
except:
|
||||
try:
|
||||
llm_result = json_repair.loads(llm_response['response'])
|
||||
except:
|
||||
llm_result = {}
|
||||
unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
|
||||
unmatched_pred_list,
|
||||
cleaned_unmatched_pred_list,
|
||||
unmatched_db_list,
|
||||
cleaned_unmatched_db_list,
|
||||
df_data,
|
||||
final_result,
|
||||
record_empty=True)
|
||||
|
||||
# break
|
||||
return final_result
|
||||
|
||||
|
||||
def post_handle_fund_matching_call(llm_result,
                                   unmatched_pred_list,
                                   cleaned_unmatched_pred_list,
                                   unmatched_db_list,
                                   cleaned_unmatched_db_list,
                                   df_data,
                                   final_result,
                                   record_empty: bool = False):
    """Apply one LLM fund-matching response to the rows and the result map.

    For every ``{prediction name: db name}`` pair in ``llm_result`` the
    cleaned names are resolved back to their positions in the cleaned lists,
    and every row of ``df_data`` whose ``'pred_fund'`` equals the raw
    prediction name at one of those positions is updated in place.

    Args:
        llm_result: Parsed LLM response, expected to be a dict mapping a
            cleaned prediction name to a cleaned db name. Any other type is
            treated as "no answer".
        unmatched_pred_list: Raw prediction names, index-aligned with
            ``cleaned_unmatched_pred_list``.
        cleaned_unmatched_pred_list: Cleaned prediction names sent to the LLM.
        unmatched_db_list: Raw db names, index-aligned with
            ``cleaned_unmatched_db_list``.
        cleaned_unmatched_db_list: Cleaned db names sent to the LLM.
        df_data: List of row dicts; mutated in place.
        final_result: Dict of raw prediction name -> raw db name; mutated in
            place.
        record_empty: When True, predictions the LLM could not map to a known
            db name are written to ``final_result`` with an empty string.
            When False they are left out of ``final_result`` (for example so
            that a later call can still fill them in).

    Returns:
        Indices into ``unmatched_pred_list`` of predictions that appeared in
        the response but were not mapped to a known db name, in first-seen
        order and without duplicates.
    """
    still_unmatched_indexes = []
    # A failed or repaired parse may hand back something that is not a dict
    # (e.g. a list or an empty string); there is nothing to apply in that case.
    if not isinstance(llm_result, dict):
        return still_unmatched_indexes

    # Several rows can share one prediction name; report each index only once
    # so a retry built from the returned indices does not repeat names.
    reported_indexes = set()

    for pred_name, db_name in llm_result.items():
        og_db_index = -1
        og_pred_index_list = [
            c_idx
            for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
            if c_item == pred_name
        ]

        if not og_pred_index_list:
            # sometimes, the raw name and db name reversed from the LLM response
            if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
                og_pred_index_list = [
                    c_idx
                    for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
                    if c_item == db_name
                ]
                og_db_index = cleaned_unmatched_db_list.index(pred_name)
                # key and value are swapped back to (prediction, db) order
                pred_name, db_name = db_name, pred_name

        if not og_pred_index_list:
            # The response key is not one of the names that were sent; ignore it.
            continue

        if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
            og_db_index = cleaned_unmatched_db_list.index(db_name)

        for row in df_data:
            for og_pred_index in og_pred_index_list:
                raw_pred_name = unmatched_pred_list[og_pred_index]
                if row['pred_fund'] != raw_pred_name:
                    continue
                if og_db_index != -1:
                    row['db_fund'] = unmatched_db_list[og_db_index]
                    row['cleaned_db_fund_name'] = db_name
                    final_result.update({raw_pred_name: unmatched_db_list[og_db_index]})
                else:
                    if og_pred_index not in reported_indexes:
                        reported_indexes.add(og_pred_index)
                        still_unmatched_indexes.append(og_pred_index)
                    row['db_fund'] = ''
                    row['cleaned_db_fund_name'] = ''
                    if record_empty:
                        final_result.update({raw_pred_name: ""})
                # Trace fields describing the LLM call that produced this row.
                row['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                # Stored as the list itself; a trailing comma here would wrap
                # it in a 1-tuple, unlike 'llm_clean_pred_list'.
                row['llm_clean_db_list'] = cleaned_unmatched_db_list
                row['llm_pred_fund'] = pred_name
                row['llm_matched_db_name'] = db_name
                row['llm_result'] = llm_result
                # The first matching index settles this row.
                break
    return still_unmatched_indexes
|
||||
|
||||
|
||||
|
||||
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
|
||||
result = api_response['data']
|
||||
doc_fund_names = [item['fund_name'] for item in result]
|
||||
|
|
|
|||
|
|
@ -560,6 +560,8 @@ class DataExtraction:
|
|||
"""
|
||||
raw_name_dict = self.get_raw_name_dict(data_list)
|
||||
raw_name_list = list(raw_name_dict.keys())
|
||||
if len(raw_name_list) < 3:
|
||||
return data_list, []
|
||||
raw_name_as_production_name = None
|
||||
for raw_name in raw_name_list:
|
||||
if self.is_production_name(raw_name):
|
||||
|
|
@ -716,6 +718,8 @@ class DataExtraction:
|
|||
raw_name = self.get_raw_name(fund_name, share_name)
|
||||
if len(raw_name) == 0:
|
||||
continue
|
||||
if raw_name.lower() in ["the fund", "sample fund"]:
|
||||
continue
|
||||
# if isinstance(self.document_production, str) and \
|
||||
# raw_name.lower() in self.document_production.lower():
|
||||
# continue
|
||||
|
|
@ -1732,7 +1736,7 @@ class DataExtraction:
|
|||
continue
|
||||
found_regex_text = False
|
||||
for regex_text in regex_text_list:
|
||||
regex_search = re.search(regex_text, page_text)
|
||||
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
|
||||
if regex_search is not None:
|
||||
found_regex_text = True
|
||||
break
|
||||
|
|
|
|||
|
|
@ -499,7 +499,13 @@
|
|||
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
|
||||
"---Example 4 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}"
|
||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
|
||||
"\n",
|
||||
"---Example 5 Start---",
|
||||
"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund – Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund – Class A \n0.96% 0.05%\n\n",
|
||||
"---Example 5 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
|
||||
],
|
||||
"performance_fee_costs": [
|
||||
"### Performance fees",
|
||||
|
|
|
|||
7
main.py
7
main.py
|
|
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
|
|||
|
||||
|
||||
def test_post_adjust_extract_data():
|
||||
doc_id = "448576924"
|
||||
doc_id = "480854121"
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
@ -1534,11 +1534,14 @@ if __name__ == "__main__":
|
|||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||
)
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
||||
# )
|
||||
logger.info(f"Start to run document sample file: {document_sample_file}")
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||
if len(doc_id.strip()) > 0]
|
||||
# special_doc_id_list = ["384508026"]
|
||||
# special_doc_id_list = ["527969661"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,87 @@
|
|||
430229604
|
||||
430249980
|
||||
434533711
|
||||
448576798
|
||||
448576868
|
||||
448576914
|
||||
448576924
|
||||
448577874
|
||||
448577877
|
||||
448578148
|
||||
448701586
|
||||
448906715
|
||||
448906720
|
||||
448906722
|
||||
448907811
|
||||
451234748
|
||||
454947973
|
||||
454947982
|
||||
454948291
|
||||
454948296
|
||||
455232983
|
||||
455235248
|
||||
462770987
|
||||
470958290
|
||||
470958296
|
||||
478920274
|
||||
478946988
|
||||
479996914
|
||||
479996918
|
||||
480713037
|
||||
480726184
|
||||
480726185
|
||||
480854103
|
||||
480854105
|
||||
480854113
|
||||
480854115
|
||||
480854118
|
||||
480854120
|
||||
480854121
|
||||
480854129
|
||||
481877313
|
||||
484628699
|
||||
484628701
|
||||
484628702
|
||||
484628703
|
||||
495516375
|
||||
495547519
|
||||
500579230
|
||||
506913190
|
||||
509581748
|
||||
520698753
|
||||
520702746
|
||||
520703007
|
||||
521591949
|
||||
521606716
|
||||
521606755
|
||||
523516443
|
||||
525464665
|
||||
528208796
|
||||
534933875
|
||||
539999907
|
||||
539999916
|
||||
540028470
|
||||
542294088
|
||||
544886057
|
||||
548035617
|
||||
550533961
|
||||
550769189
|
||||
552727485
|
||||
555377021
|
||||
556527310
|
||||
557362550
|
||||
557526104
|
||||
557526108
|
||||
557526111
|
||||
557526129
|
||||
557526130
|
||||
557526143
|
||||
557526145
|
||||
562753667
|
||||
562753673
|
||||
562754590
|
||||
570781265
|
||||
572302455
|
||||
572302463
|
||||
573372424
|
||||
577949367
|
||||
Loading…
Reference in New Issue