1. optimize instructions for fund name

2. optimize drilldown logic
2024-11-12 17:01:10 -06:00 · 2024-11-12 17:01:10 -06:00 · 7a41b03634
parent c2d2e54670
commit 7a41b03634
3 changed files with 35 additions and 12 deletions
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@ -27,9 +27,14 @@
 			"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", 
 			"b. The sub-fund name may be as the first column or first row values in the table.",
 			"b.1 fund name example:",
-			"- context:",
+			"---- context:",
 			"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
-			"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+			"---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+			"c. The fund name should be the one nearest to the relevant share/ data point/ value.",
+			"c.1 fund name example:",
+			"---- context:",
+			"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26%  \n1.29%",
+			"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
 			"- Only extract the latest data from context:",
 			"If with multiple data values in same row, please extract the latest.",
 			"- Reported names:",
--- a/main.py
+++ b/main.py
@ -1110,11 +1110,11 @@ if __name__ == "__main__":
                            "546046730",
                            "546919329"
                            ]
-    special_doc_id_list = ["479742284"]
+    special_doc_id_list = ["506326520"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_extract_data = True
+    re_run_mapping_data = True
    force_save_total_data = False
    calculate_metrics = False

--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@ -374,18 +374,17 @@ class PDFUtil:
        if highlight_value is not None and len(highlight_value.strip()) > 0:
            pure_highlight_value = highlight_value.strip()
            
+            highlight_value_search_text = None
            if len(pure_highlight_value.split()) == 1 and \
                (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
-                highlight_value_search = re.search(highlight_value_regex, page_text)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
            else:
                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
-                highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
-                if highlight_value_search is None:
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
+                if highlight_value_search_text is None:
                    highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
-                    highlight_value_search = re.search(highlight_value_regex, page_text)
-            if highlight_value_search is not None:
-                highlight_value_search_text = highlight_value_search.group()
+                    highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
                
        annotation_data = {"pdf_file": self.simple_pdf_file,
                          "page_index": page_index,
@ -433,6 +432,19 @@ class PDFUtil:
            annotation_data["matching_val_area"] = bbox_list
        return annotation_data
    
+    def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
+        if ignore_case:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text, re.IGNORECASE)
+        else:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text)
+        
+        highlight_value_search_text = None
+        for highlight_value_search in highlight_value_search_iter:
+            highlight_value_search_text = highlight_value_search.group().strip()
+            if highlight_value_search_text == raw_value:
+                return highlight_value_search_text
+        return highlight_value_search_text
+    
    def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
        if text is None or len(text) == 0:
            return text
@ -442,7 +454,13 @@ class PDFUtil:
                continue
            replace = r"\{0}".format(special_iter.group())
            if replace not in text:
+                special_iter_text = special_iter.group()
+                if special_iter_text == ")" and text.strip()[-1] == ")" and \
+                    text.strip().count(")") == 1:
+                    text = text.replace(")", r"\)")
+                else:
                    text = re.sub(replace, r"\\W", text)
+                
        text = re.sub(r"( ){2,}", " ", text)
        if match_special_char_after_space:
            text = text.replace(" ", r"\s*\W*")