From fa763f4f1424d561db0908ab25306e196b868903 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Thu, 24 Oct 2024 16:24:21 -0500
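Subject: [PATCH] 1. optimize instructions (ignore N/A or dash values in the latest-date column) 2. optimize mapping algorithm (drop 'net'/'unhgd' tokens and collapse whitespace in replace_abbrevation)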

---
 instructions/data_extraction_prompts_config.json | 9 ++++++++-
 main.py                                          | 4 ++--
 utils/biz_utils.py                               | 5 +++--
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index df48bfe..5b19973 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -103,7 +103,14 @@
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
 					"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
-					"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns."
+					"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns.",
+					"Case 3:",
+					"If the value of the column with the latest date is N/A or -, please ignore it.",
+					"-----Example Start-----",
+					"I-class income shares\n\n31.10.22\n30.04.22\n30.04.21\n30.04.20\n\npence per share\npence per share\npence per share\npence per share\nOther information\nOperating charges**\nN/A\n—\n0.90%\n0.90%",
+					"-----Example End-----",
+					"The output should be:",
+					"{\"data\": []}"
 				]
 			}
 		],
diff --git a/main.py b/main.py
index 20a191c..dca10df 100644
--- a/main.py
+++ b/main.py
@@ -854,10 +854,10 @@ if __name__ == "__main__":
     ]
     # special_doc_id_list = check_mapping_doc_id_list
     special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["543243654"]
+    special_doc_id_list = ["423418540"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
     re_run_mapping_data = True
     force_save_total_data = False
     calculate_metrics = False
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index c89d600..22e780c 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -812,8 +812,6 @@ def replace_abbrevation(text: str):
             new_text_splits.append('Advantage')
         elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']:
             new_text_splits.append('Hedged')
-        elif split.lower() in ['unhgd']:
-            split = ""
         elif split.lower() in ['cl', 'cl.']:
             new_text_splits.append('Class')
         elif split.lower() in ['ser', 'ser.']:
@@ -824,8 +822,11 @@ def replace_abbrevation(text: str):
             new_text_splits.append('no trail')
         elif split.lower() in ['non']:
             new_text_splits.append('Not')
+        elif split.lower() in ['net', 'unhgd']:
+            new_text_splits.append('')
         else:
             new_text_splits.append(split)
     
     new_text = ' '.join(new_text_splits)
+    new_text = re.sub(r'\s+', ' ', new_text).strip()
     return new_text
\ No newline at end of file