From 95c386911c203b9b547163f82c9944f18b597c73 Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Wed, 4 Dec 2024 22:08:09 -0600
Subject: [PATCH] Clean fund name after getting response from ChatGPT

---
 core/data_extraction.py | 11 ++++++++++-
 main.py                 |  4 ++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/core/data_extraction.py b/core/data_extraction.py
index 7b34530..380affe 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -523,16 +523,25 @@ class DataExtraction:
             fund_name = data.get("fund name", "").strip()
             if fund_name == "":
                 remove_list.append(data)
+            
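+            # Clean fund name start: strip the previous page's last fund name and known section headings from the front. NOTE(review): the heading list below rebinds remove_list, the same name used by remove_list.append(data) above - rename it (e.g. remove_prefix_list) to avoid the clash.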
             if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
                 previous_page_last_fund = previous_page_last_fund.strip()
                 if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund:
                     modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
                     if len(modified_fund_name.split()) > 1:
                         fund_name = modified_fund_name
-                    
+            remove_list = ["Market Specific Equity Sub-Funds", 
+                           "International and Regional Equity Sub-Funds",
+                           "Equity Sub-Funds"]
+            for remove_item in remove_list:
+                if fund_name.startswith(remove_item):
+                    fund_name = fund_name.replace(remove_item, "").strip()
             fund_name = self.get_fund_name(fund_name, "Fund")
             fund_name = self.get_fund_name(fund_name, "Bond")
             data["fund name"] = fund_name
+            # Clean fund name end
+            
             keys = list(data.keys())
             for key in keys:
                 if self.datapoint_level_config.get(key, "") == "share_level":
diff --git a/main.py b/main.py
index f847aec..c605065 100644
--- a/main.py
+++ b/main.py
@@ -1151,7 +1151,7 @@ def batch_run_documents():
         "534535767"
     ]
     special_doc_id_list = check_db_mapping_doc_id_list
-    special_doc_id_list = ["534535767"]
+    # special_doc_id_list = ["481482392"]
     pdf_folder = r"/data/emea_ar/pdf/"
     page_filter_ground_truth_file = (
         r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@@ -1162,7 +1162,7 @@ def batch_run_documents():
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_extract_data = True
     re_run_mapping_data = True
-    force_save_total_data = False
+    force_save_total_data = True
     calculate_metrics = False
 
     extract_ways = ["text"]