From d79b05885d7a5f71983b8f24dd414bd5795e91b8 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Fri, 6 Dec 2024 14:50:34 -0600
Subject: [PATCH] Optimize TOR prompts to prefer the uncorrected turnover rate

---
 configuration/datapoint_reported_name.json       | 2 +-
 instructions/data_extraction_prompts_config.json | 3 ++-
 main.py                                          | 9 ++++-----
 specific_calc_metrics.py                         | 8 +++++---
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json
index 9e115b1..e1b0342 100644
--- a/configuration/datapoint_reported_name.json
+++ b/configuration/datapoint_reported_name.json
@@ -176,7 +176,7 @@
             "Rotatie van de portefeuille",
             "POF",
             "Portefeuille omloop factor",
-            "Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
+            "Taux de rotation - Omloopsnelheid"
         ],
         "french": [
             "taux de rotation",
diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index b341774..c2a676a 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -70,7 +70,8 @@
 		"special_rule": {
 			"tor": [
 				"If there are multiple TOR reported names, here is the priority rules:",
-				"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"."
+				"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation - Omloopsnelheid\".",
+				"- With \"Omloopsnelheid\" and \"Gecorrigeerde omloopsnelheid\", pick up the values from \"Omloopsnelheid\"."
 			],
 			"ter": [
 				"If there are multiple TER value columns, here is the priority rules:",
diff --git a/main.py b/main.py
index 3e2e095..d965cfb 100644
--- a/main.py
+++ b/main.py
@@ -1122,7 +1122,6 @@ def batch_run_documents():
         "407275419",
         "337937633",
         "337293427",
-        "334584772",
         "404712928",
         "451063582",
         "451878128",
@@ -1151,7 +1150,7 @@ def batch_run_documents():
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["422760156"]
+    # special_doc_id_list = ["407275419", "425595958", "451063582", "451878128"]
     pdf_folder = r"/data/emea_ar/pdf/"
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1160,9 +1159,9 @@ def batch_run_documents():
     output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = True
-    re_run_mapping_data = True
-    force_save_total_data = False
+    re_run_extract_data = False
+    re_run_mapping_data = False
+    force_save_total_data = True
     calculate_metrics = False
 
     extract_ways = ["text"]
diff --git a/specific_calc_metrics.py b/specific_calc_metrics.py
index 735fc71..620de34 100644
--- a/specific_calc_metrics.py
+++ b/specific_calc_metrics.py
@@ -62,9 +62,11 @@ def calculate_complex_document_metrics(verify_file_path: str, document_list: lis
     document_count = len(document_list) \
     if document_list is not None and len(document_list) > 0 \
         else len(data_df["doc_id"].unique())
-        
+    
+    verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
+    
     output_metrics_file = os.path.join(output_folder, 
-                                       f"complex_document_{document_count}_metrics.xlsx")
+                                       f"complex_{verify_file_name}_metrics.xlsx")
     with pd.ExcelWriter(output_metrics_file) as writer:
         metrics_df.to_excel(writer, index=False, sheet_name="metrics")
 
@@ -111,7 +113,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
 
 if __name__ == "__main__":
     file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
-    verify_file = "mapping_data_info_31_documents_by_text_first_round.xlsx"
+    verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
     verify_file_path = os.path.join(file_folder, verify_file)
     document_list = [
         "334584772",