From 7a41b0363445a0fbee3bbddfdd4b1dacd1e81c7f Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Tue, 12 Nov 2024 17:01:10 -0600
Subject: [PATCH] 1. optimize instructions for fund name 2. optimize drilldown
 logic

---
 .../data_extraction_prompts_config.json       |  9 ++++--
 main.py                                       |  6 ++--
 utils/pdf_util.py                             | 32 +++++++++++++++----
 3 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index 5a09b09..f1304e5 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -27,9 +27,14 @@
 			"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", 
 			"b. The sub-fund name may be as the first column or first row values in the table.",
 			"b.1 fund name example:",
-			"- context:",
+			"---- context:",
 			"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
-			"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+			"---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+			"c. The fund name should be the nearest fund for relevant share/ data point/ value.",
+			"c.1 fund name example:",
+			"---- context:",
+			"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26%  \n1.29%",
+			"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
 			"- Only extract the latest data from context:",
 			"If with multiple data values in same row, please extract the latest.",
 			"- Reported names:",
diff --git a/main.py b/main.py
index 55ee1fd..176120b 100644
--- a/main.py
+++ b/main.py
@@ -1110,11 +1110,11 @@ if __name__ == "__main__":
                             "546046730",
                             "546919329"
                             ]
-    special_doc_id_list = ["479742284"]
+    special_doc_id_list = ["506326520"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_extract_data = True
+    re_run_mapping_data = True
     force_save_total_data = False
     calculate_metrics = False
 
diff --git a/utils/pdf_util.py b/utils/pdf_util.py
index 3040a87..bdadff2 100644
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@@ -374,18 +374,17 @@ class PDFUtil:
         if highlight_value is not None and len(highlight_value.strip()) > 0:
             pure_highlight_value = highlight_value.strip()
             
+            highlight_value_search_text = None
             if len(pure_highlight_value.split()) == 1 and \
                 (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
                 highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
-                highlight_value_search = re.search(highlight_value_regex, page_text)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
             else:
                 highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
-                highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
-                if highlight_value_search is None:
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
+                if highlight_value_search_text is None:
                     highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
-                    highlight_value_search = re.search(highlight_value_regex, page_text)
-            if highlight_value_search is not None:
-                highlight_value_search_text = highlight_value_search.group()
+                    highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
                 
         annotation_data = {"pdf_file": self.simple_pdf_file,
                           "page_index": page_index,
@@ -433,6 +432,19 @@ class PDFUtil:
             annotation_data["matching_val_area"] = bbox_list
         return annotation_data
     
+    def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
+        if ignore_case:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text, re.IGNORECASE)
+        else:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text)
+        # Scan every regex match in page_text: return the first whose stripped text equals raw_value; otherwise the last match seen is returned (None when nothing matched).
+        highlight_value_search_text = None
+        for highlight_value_search in highlight_value_search_iter:
+            highlight_value_search_text = highlight_value_search.group().strip()
+            if highlight_value_search_text == raw_value:  # plain == comparison, so case-sensitive even when ignore_case is True
+                return highlight_value_search_text
+        return highlight_value_search_text
+    
     def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
         if text is None or len(text) == 0:
             return text
@@ -442,7 +454,13 @@ class PDFUtil:
                 continue
             replace = r"\{0}".format(special_iter.group())
             if replace not in text:
-                text = re.sub(replace, r"\\W", text)
+                special_iter_text = special_iter.group()
+                if special_iter_text == ")" and text.strip()[-1] == ")" and \
+                    text.strip().count(")") == 1:
+                    text = text.replace(")", r"\)")
+                else:
+                    text = re.sub(replace, r"\\W", text)
+                
         text = re.sub(r"( ){2,}", " ", text)
         if match_special_char_after_space:
             text = text.replace(" ", r"\s*\W*")