1. Optimize instructions

2. Optimize mapping algorithm
This commit is contained in:
Blade He 2024-10-24 16:24:21 -05:00
parent 53dadf61f4
commit fa763f4f14
3 changed files with 13 additions and 5 deletions

View File

@@ -103,7 +103,14 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}", "{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).", "Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns." "If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns.",
"Case 3:",
"If the value of column with latest date is N/A or -, please ignore.",
"-----Example Start-----",
"I-class income shares\n\n31.10.22\n30.04.22\n30.04.21\n30.04.20\n\npence per share\npence per share\npence per share\npence per share\nOther information\nOperating charges**\nN/A\n—\n0.90%\n0.90%",
"-----Example End-----",
"The output should be:",
"{\"data\": []}"
] ]
} }
], ],

View File

@@ -854,10 +854,10 @@ if __name__ == "__main__":
] ]
# special_doc_id_list = check_mapping_doc_id_list # special_doc_id_list = check_mapping_doc_id_list
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["543243654"] special_doc_id_list = ["423418540"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = False
calculate_metrics = False calculate_metrics = False

View File

@@ -812,8 +812,6 @@ def replace_abbrevation(text: str):
new_text_splits.append('Advantage') new_text_splits.append('Advantage')
elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
new_text_splits.append('Hedged') new_text_splits.append('Hedged')
elif split.lower() in ['unhgd']:
split = ""
elif split.lower() in ['cl', 'cl.']: elif split.lower() in ['cl', 'cl.']:
new_text_splits.append('Class') new_text_splits.append('Class')
elif split.lower() in ['ser', 'ser.']: elif split.lower() in ['ser', 'ser.']:
@@ -824,8 +822,11 @@ def replace_abbrevation(text: str):
new_text_splits.append('no trail') new_text_splits.append('no trail')
elif split.lower() in ['non']: elif split.lower() in ['non']:
new_text_splits.append('Not') new_text_splits.append('Not')
elif split.lower() in ['net', 'unhgd']:
new_text_splits.append('')
else: else:
new_text_splits.append(split) new_text_splits.append(split)
new_text = ' '.join(new_text_splits) new_text = ' '.join(new_text_splits)
new_text = re.sub(r'\s+', ' ', new_text).strip()
return new_text return new_text