1. optimize instructions

2. optimize mapping algorithm
2024-10-24 16:24:21 -05:00 · 2024-10-24 16:24:21 -05:00 · fa763f4f14
parent 53dadf61f4
commit fa763f4f14
3 changed files with 13 additions and 5 deletions
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -103,7 +103,14 @@
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
 					"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
-					"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns."
+					"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns.",
+					"Case 3:",
+					"If the value of column with latest date is N/A or -, please ignore.",
+					"-----Example Start-----",
+					"I-class income shares\n\n31.10.22\n30.04.22\n30.04.21\n30.04.20\n\npence per share\npence per share\npence per share\npence per share\nOther information\nOperating charges**\nN/A\n—\n0.90%\n0.90%",
+					"-----Example End-----",
+					"The output should be:",
+					"{\"data\": []}"
 				]
 			}
 		],
--- a/main.py
+++ b/main.py
@@ -854,10 +854,10 @@ if __name__ == "__main__":
    ]
    # special_doc_id_list = check_mapping_doc_id_list
    special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["543243654"]
+    special_doc_id_list = ["423418540"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
    re_run_mapping_data = True
    force_save_total_data = False
    calculate_metrics = False
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -812,8 +812,6 @@ def replace_abbrevation(text: str):
            new_text_splits.append('Advantage')
        elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
            new_text_splits.append('Hedged')
-        elif split.lower() in ['unhgd']:
-            split = ""
        elif split.lower() in ['cl', 'cl.']:
            new_text_splits.append('Class')
        elif split.lower() in ['ser', 'ser.']:
@@ -824,8 +822,11 @@ def replace_abbrevation(text: str):
            new_text_splits.append('no trail')
        elif split.lower() in ['non']:
            new_text_splits.append('Not')
+        elif split.lower() in ['net', 'unhgd']:
+            new_text_splits.append('')
        else:
            new_text_splits.append(split)
    
    new_text = ' '.join(new_text_splits)
+    new_text = re.sub(r'\s+', ' ', new_text).strip()
    return new_text