1. Support re-calling the ChatGPT API to match predicted fund/share names that were left unmatched by the first call
2. If a document contains fewer than 3 fund names, skip the production-name judgment logic
This commit is contained in:
parent
4cee95db9a
commit
427a379b3b
|
|
@ -653,24 +653,74 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
||||||
llm_result = json_repair.loads(llm_response['response'])
|
llm_result = json_repair.loads(llm_response['response'])
|
||||||
except:
|
except:
|
||||||
llm_result = {}
|
llm_result = {}
|
||||||
# try:
|
unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
|
||||||
# llm_result = ast.literal_eval(llm_response['response'].replace('\n',''))
|
unmatched_pred_list,
|
||||||
# except Exception as e:
|
cleaned_unmatched_pred_list,
|
||||||
# logger.info(f"error: {e}")
|
unmatched_db_list,
|
||||||
# cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '')
|
cleaned_unmatched_db_list,
|
||||||
# llm_result = json.loads(cleaned_response)
|
df_data,
|
||||||
# logger.info(f"\n\n llm_result: {llm_result}")
|
final_result,
|
||||||
|
record_empty=False)
|
||||||
|
"""
|
||||||
|
For some cases, same document,
|
||||||
|
perhaps same funds/ shares are with different raw names in different pages.
|
||||||
|
e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
|
||||||
|
But if only call ChatGPT API one time, it will not be able to match all of them.
|
||||||
|
"""
|
||||||
|
if len(unmantched_pred_index_list)>0:
|
||||||
|
unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
|
||||||
|
cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
|
||||||
|
prompt_context = f"""
|
||||||
|
{prompt_instruction}
|
||||||
|
|
||||||
|
provider_name: {provider_name}
|
||||||
|
|
||||||
|
prediction_fund:
|
||||||
|
{cleaned_unmatched_pred_list}
|
||||||
|
|
||||||
|
true_fund:
|
||||||
|
{cleaned_unmatched_db_list}
|
||||||
|
"""
|
||||||
|
llm_response, with_error = chat(
|
||||||
|
prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
|
||||||
|
)
|
||||||
|
# logger.info(f"fund matching LLM Response: {llm_response}")
|
||||||
|
if 'response' in llm_response.keys():
|
||||||
|
try:
|
||||||
|
llm_result = json.loads(llm_response['response'])
|
||||||
|
except:
|
||||||
|
try:
|
||||||
|
llm_result = json_repair.loads(llm_response['response'])
|
||||||
|
except:
|
||||||
|
llm_result = {}
|
||||||
|
unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
|
||||||
|
unmatched_pred_list,
|
||||||
|
cleaned_unmatched_pred_list,
|
||||||
|
unmatched_db_list,
|
||||||
|
cleaned_unmatched_db_list,
|
||||||
|
df_data,
|
||||||
|
final_result,
|
||||||
|
record_empty=True)
|
||||||
|
|
||||||
|
return final_result
|
||||||
|
|
||||||
|
|
||||||
|
def post_handle_fund_matching_call(llm_result,
|
||||||
|
unmatched_pred_list,
|
||||||
|
cleaned_unmatched_pred_list,
|
||||||
|
unmatched_db_list,
|
||||||
|
cleaned_unmatched_db_list,
|
||||||
|
df_data,
|
||||||
|
final_result,
|
||||||
|
record_empty: bool = False):
|
||||||
|
unmantched_pred_index_list = []
|
||||||
for pred_name,db_name in llm_result.items():
|
for pred_name,db_name in llm_result.items():
|
||||||
# print("k: ",k)
|
|
||||||
# print("v: ",v)
|
|
||||||
og_db_index=-1
|
og_db_index=-1
|
||||||
# og_pred_index = -1
|
|
||||||
og_pred_index_list = []
|
og_pred_index_list = []
|
||||||
if pred_name in cleaned_unmatched_pred_list:
|
if pred_name in cleaned_unmatched_pred_list:
|
||||||
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
||||||
if c_item==pred_name:
|
if c_item==pred_name:
|
||||||
og_pred_index_list.append(c_idx)
|
og_pred_index_list.append(c_idx)
|
||||||
# og_pred_index = cleaned_unmatched_pred_list.index(k)
|
|
||||||
|
|
||||||
if len(og_pred_index_list) == 0:
|
if len(og_pred_index_list) == 0:
|
||||||
# sometimes, the raw name and db name reversed from the LLM response
|
# sometimes, the raw name and db name reversed from the LLM response
|
||||||
|
|
@ -678,7 +728,6 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
||||||
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
|
||||||
if c_item==db_name:
|
if c_item==db_name:
|
||||||
og_pred_index_list.append(c_idx)
|
og_pred_index_list.append(c_idx)
|
||||||
# og_pred_index = cleaned_unmatched_pred_list.index(v)
|
|
||||||
og_db_index = cleaned_unmatched_db_list.index(pred_name)
|
og_db_index = cleaned_unmatched_db_list.index(pred_name)
|
||||||
# v and k are swapped
|
# v and k are swapped
|
||||||
temp = db_name
|
temp = db_name
|
||||||
|
|
@ -686,11 +735,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
||||||
pred_name = temp
|
pred_name = temp
|
||||||
if len(og_pred_index_list)==0:
|
if len(og_pred_index_list)==0:
|
||||||
continue
|
continue
|
||||||
# og_db_index = cleaned_unmatched_db_list.index(v)
|
|
||||||
if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
|
if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
|
||||||
og_db_index = cleaned_unmatched_db_list.index(db_name)
|
og_db_index = cleaned_unmatched_db_list.index(db_name)
|
||||||
# print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
|
|
||||||
# print("unmatched_db_list: ",unmatched_db_list)
|
|
||||||
|
|
||||||
for i in df_data:
|
for i in df_data:
|
||||||
for og_pred_index in og_pred_index_list:
|
for og_pred_index in og_pred_index_list:
|
||||||
|
|
@ -700,8 +746,10 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
||||||
i['cleaned_db_fund_name'] = db_name
|
i['cleaned_db_fund_name'] = db_name
|
||||||
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
|
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
|
||||||
else:
|
else:
|
||||||
|
unmantched_pred_index_list.append(og_pred_index)
|
||||||
i['db_fund'] = ''
|
i['db_fund'] = ''
|
||||||
i['cleaned_db_fund_name'] = ''
|
i['cleaned_db_fund_name'] = ''
|
||||||
|
if record_empty:
|
||||||
final_result.update({unmatched_pred_list[og_pred_index]:""})
|
final_result.update({unmatched_pred_list[og_pred_index]:""})
|
||||||
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
|
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
|
||||||
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
|
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
|
||||||
|
|
@ -709,10 +757,9 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
|
||||||
i['llm_matched_db_name'] = db_name
|
i['llm_matched_db_name'] = db_name
|
||||||
i['llm_result'] = llm_result
|
i['llm_result'] = llm_result
|
||||||
break
|
break
|
||||||
|
return unmantched_pred_index_list
|
||||||
|
|
||||||
|
|
||||||
# break
|
|
||||||
return final_result
|
|
||||||
|
|
||||||
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
|
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
|
||||||
result = api_response['data']
|
result = api_response['data']
|
||||||
|
|
|
||||||
|
|
@ -560,6 +560,8 @@ class DataExtraction:
|
||||||
"""
|
"""
|
||||||
raw_name_dict = self.get_raw_name_dict(data_list)
|
raw_name_dict = self.get_raw_name_dict(data_list)
|
||||||
raw_name_list = list(raw_name_dict.keys())
|
raw_name_list = list(raw_name_dict.keys())
|
||||||
|
if len(raw_name_list) < 3:
|
||||||
|
return data_list, []
|
||||||
raw_name_as_production_name = None
|
raw_name_as_production_name = None
|
||||||
for raw_name in raw_name_list:
|
for raw_name in raw_name_list:
|
||||||
if self.is_production_name(raw_name):
|
if self.is_production_name(raw_name):
|
||||||
|
|
@ -716,6 +718,8 @@ class DataExtraction:
|
||||||
raw_name = self.get_raw_name(fund_name, share_name)
|
raw_name = self.get_raw_name(fund_name, share_name)
|
||||||
if len(raw_name) == 0:
|
if len(raw_name) == 0:
|
||||||
continue
|
continue
|
||||||
|
if raw_name.lower() in ["the fund", "sample fund"]:
|
||||||
|
continue
|
||||||
# if isinstance(self.document_production, str) and \
|
# if isinstance(self.document_production, str) and \
|
||||||
# raw_name.lower() in self.document_production.lower():
|
# raw_name.lower() in self.document_production.lower():
|
||||||
# continue
|
# continue
|
||||||
|
|
@ -1732,7 +1736,7 @@ class DataExtraction:
|
||||||
continue
|
continue
|
||||||
found_regex_text = False
|
found_regex_text = False
|
||||||
for regex_text in regex_text_list:
|
for regex_text in regex_text_list:
|
||||||
regex_search = re.search(regex_text, page_text)
|
regex_search = re.search(regex_text, page_text, re.IGNORECASE)
|
||||||
if regex_search is not None:
|
if regex_search is not None:
|
||||||
found_regex_text = True
|
found_regex_text = True
|
||||||
break
|
break
|
||||||
|
|
|
||||||
|
|
@ -499,7 +499,13 @@
|
||||||
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
|
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
|
||||||
"---Example 4 End---",
|
"---Example 4 End---",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}"
|
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
|
||||||
|
"\n",
|
||||||
|
"---Example 5 Start---",
|
||||||
|
"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund – Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund – Class A \n0.96% 0.05%\n\n",
|
||||||
|
"---Example 5 End---",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
|
||||||
],
|
],
|
||||||
"performance_fee_costs": [
|
"performance_fee_costs": [
|
||||||
"### Performance fees",
|
"### Performance fees",
|
||||||
|
|
|
||||||
7
main.py
7
main.py
|
|
@ -1448,7 +1448,7 @@ def get_aus_prospectus_document_category():
|
||||||
|
|
||||||
|
|
||||||
def test_post_adjust_extract_data():
|
def test_post_adjust_extract_data():
|
||||||
doc_id = "448576924"
|
doc_id = "480854121"
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
@ -1534,11 +1534,14 @@ if __name__ == "__main__":
|
||||||
document_sample_file = (
|
document_sample_file = (
|
||||||
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
|
||||||
)
|
)
|
||||||
|
# document_sample_file = (
|
||||||
|
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
|
||||||
|
# )
|
||||||
logger.info(f"Start to run document sample file: {document_sample_file}")
|
logger.info(f"Start to run document sample file: {document_sample_file}")
|
||||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
|
||||||
if len(doc_id.strip()) > 0]
|
if len(doc_id.strip()) > 0]
|
||||||
# special_doc_id_list = ["384508026"]
|
# special_doc_id_list = ["527969661"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,87 @@
|
||||||
|
430229604
|
||||||
|
430249980
|
||||||
|
434533711
|
||||||
|
448576798
|
||||||
|
448576868
|
||||||
|
448576914
|
||||||
|
448576924
|
||||||
|
448577874
|
||||||
|
448577877
|
||||||
|
448578148
|
||||||
|
448701586
|
||||||
|
448906715
|
||||||
|
448906720
|
||||||
|
448906722
|
||||||
|
448907811
|
||||||
|
451234748
|
||||||
|
454947973
|
||||||
|
454947982
|
||||||
|
454948291
|
||||||
|
454948296
|
||||||
|
455232983
|
||||||
|
455235248
|
||||||
|
462770987
|
||||||
|
470958290
|
||||||
|
470958296
|
||||||
|
478920274
|
||||||
|
478946988
|
||||||
|
479996914
|
||||||
|
479996918
|
||||||
|
480713037
|
||||||
|
480726184
|
||||||
|
480726185
|
||||||
|
480854103
|
||||||
|
480854105
|
||||||
|
480854113
|
||||||
|
480854115
|
||||||
|
480854118
|
||||||
|
480854120
|
||||||
|
480854121
|
||||||
|
480854129
|
||||||
|
481877313
|
||||||
|
484628699
|
||||||
|
484628701
|
||||||
|
484628702
|
||||||
|
484628703
|
||||||
|
495516375
|
||||||
|
495547519
|
||||||
|
500579230
|
||||||
|
506913190
|
||||||
|
509581748
|
||||||
|
520698753
|
||||||
|
520702746
|
||||||
|
520703007
|
||||||
|
521591949
|
||||||
|
521606716
|
||||||
|
521606755
|
||||||
|
523516443
|
||||||
|
525464665
|
||||||
|
528208796
|
||||||
|
534933875
|
||||||
|
539999907
|
||||||
|
539999916
|
||||||
|
540028470
|
||||||
|
542294088
|
||||||
|
544886057
|
||||||
|
548035617
|
||||||
|
550533961
|
||||||
|
550769189
|
||||||
|
552727485
|
||||||
|
555377021
|
||||||
|
556527310
|
||||||
|
557362550
|
||||||
|
557526104
|
||||||
|
557526108
|
||||||
|
557526111
|
||||||
|
557526129
|
||||||
|
557526130
|
||||||
|
557526143
|
||||||
|
557526145
|
||||||
|
562753667
|
||||||
|
562753673
|
||||||
|
562754590
|
||||||
|
570781265
|
||||||
|
572302455
|
||||||
|
572302463
|
||||||
|
573372424
|
||||||
|
577949367
|
||||||
Loading…
Reference in New Issue