From fa763f4f1424d561db0908ab25306e196b868903 Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 24 Oct 2024 16:24:21 -0500 Subject: [PATCH] 1. optimize instructions 2. optimize mapping algorithm --- instructions/data_extraction_prompts_config.json | 9 ++++++++- main.py | 4 ++-- utils/biz_utils.py | 5 +++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index df48bfe..5b19973 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -103,7 +103,14 @@ "The output should be:", "{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}", "Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).", - "If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns." + "If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns.", + "Case 3:", + "If the value of column with latest date is N/A or -, please ignore.", + "-----Example Start-----", + "I-class income shares\n\n31.10.22\n30.04.22\n30.04.21\n30.04.20\n\npence per share\npence per share\npence per share\npence per share\nOther information\nOperating charges**\nN/A\n—\n0.90%\n0.90%", + "-----Example End-----", + "The output should be:", + "{\"data\": []}" ] } ], diff --git a/main.py b/main.py index 20a191c..dca10df 100644 --- a/main.py +++ b/main.py @@ -854,10 +854,10 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["543243654"] + special_doc_id_list = ["423418540"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = False + re_run_extract_data = True re_run_mapping_data = True force_save_total_data = False calculate_metrics = False diff --git a/utils/biz_utils.py b/utils/biz_utils.py index c89d600..22e780c 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -812,8 +812,6 @@ def replace_abbrevation(text: str): new_text_splits.append('Advantage') elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: new_text_splits.append('Hedged') - elif split.lower() in ['unhgd']: - split = "" elif split.lower() in ['cl', 'cl.']: new_text_splits.append('Class') elif split.lower() in ['ser', 'ser.']: @@ -824,8 +822,11 @@ def replace_abbrevation(text: str): new_text_splits.append('no trail') elif split.lower() in ['non']: new_text_splits.append('Not') + elif split.lower() in ['net', 'unhgd']: + new_text_splits.append('') else: new_text_splits.append(split) new_text = ' '.join(new_text_splits) + new_text = re.sub(r'\s+', ' ', new_text).strip() return new_text \ No newline at end of file