1. optimize instructions for fund name

2. optimize drilldown logic
2024-11-12 17:01:10 -06:00 · 2024-11-12 17:01:10 -06:00 · 7a41b03634
parent c2d2e54670
commit 7a41b03634
3 changed files with 35 additions and 12 deletions
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@ -27,9 +27,14 @@
 			"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", 
 			"b. The sub-fund name may be as the first column or first row values in the table.",
 			"b.1 fund name example:",
-			"- context:",
+			"---- context:",
 			"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
-			"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+			"---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
 			"c. The fund name should be the nearest fund for relevant share/ data point/ value.",
 			"c.1 fund name example:",
 			"---- context:",
 			"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26%  \n1.29%",
 			"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
 			"- Only extract the latest data from context:",
 			"If with multiple data values in same row, please extract the latest.",
 			"- Reported names:",
--- a/main.py
+++ b/main.py
@ -1110,11 +1110,11 @@ if __name__ == "__main__":
                            "546046730",
                            "546919329"
                            ]
-    special_doc_id_list = ["479742284"]
+    special_doc_id_list = ["506326520"]
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
-    re_run_mapping_data = False
+    re_run_mapping_data = True
    force_save_total_data = False
    calculate_metrics = False
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@ -374,18 +374,17 @@ class PDFUtil:
        if highlight_value is not None and len(highlight_value.strip()) > 0:
            pure_highlight_value = highlight_value.strip()
            highlight_value_search_text = None
            if len(pure_highlight_value.split()) == 1 and \
                (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
-                highlight_value_search = re.search(highlight_value_regex, page_text)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
            else:
                highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
-                highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
-                if highlight_value_search is None:
+                if highlight_value_search_text is None:
                    highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
-                    highlight_value_search = re.search(highlight_value_regex, page_text)
+                    highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
            if highlight_value_search is not None:
                highlight_value_search_text = highlight_value_search.group()
        annotation_data = {"pdf_file": self.simple_pdf_file,
                          "page_index": page_index,
@ -433,6 +432,19 @@ class PDFUtil:
            annotation_data["matching_val_area"] = bbox_list
        return annotation_data
    def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
        """Pick the text on the page that best corresponds to ``raw_value``.

        Every match of ``highlight_value_regex`` in ``page_text`` is scanned
        in order, with surrounding whitespace stripped from the matched text.

        Args:
            raw_value: The value a match is compared against for exact equality.
            highlight_value_regex: Regular expression used to locate candidates.
            page_text: Text that is searched.
            ignore_case: When true, the regex is applied with ``re.IGNORECASE``.

        Returns:
            The first stripped match equal to ``raw_value``; if no match is
            equal, the stripped text of the last match; ``None`` when the
            regex does not match at all.
        """
        regex_flags = re.IGNORECASE if ignore_case else 0
        candidate = None
        for found in re.finditer(highlight_value_regex, page_text, regex_flags):
            candidate = found.group().strip()
            if candidate == raw_value:
                # Exact hit: stop scanning so later matches cannot replace it.
                break
        return candidate
    def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
        if text is None or len(text) == 0:
            return text
@ -442,7 +454,13 @@ class PDFUtil:
                continue
            replace = r"\{0}".format(special_iter.group())
            if replace not in text:
                special_iter_text = special_iter.group()
                if special_iter_text == ")" and text.strip()[-1] == ")" and \
                    text.strip().count(")") == 1:
                    text = text.replace(")", r"\)")
                else:
                    text = re.sub(replace, r"\\W", text)
        text = re.sub(r"( ){2,}", " ", text)
        if match_special_char_after_space:
            text = text.replace(" ", r"\s*\W*")