From 7a41b0363445a0fbee3bbddfdd4b1dacd1e81c7f Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 12 Nov 2024 17:01:10 -0600 Subject: [PATCH] 1. optimize instructions for fund name 2. optimize drilldown logic --- .../data_extraction_prompts_config.json | 9 ++++-- main.py | 6 ++-- utils/pdf_util.py | 32 +++++++++++++++---- 3 files changed, 35 insertions(+), 12 deletions(-) diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 5a09b09..f1304e5 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -27,9 +27,14 @@ "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", "b. The sub-fund name may be as the first column or first row values in the table.", "b.1 fund name example:", - "- context:", + "---- context:", "Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)", - "fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)", + "---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)", + "c. The fund name should be the nearest fund for relevant share/ data point/ value.", + "c.1 fund name example:", + "---- context:", + "AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26% \n1.29%", + "---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon", "- Only extract the latest data from context:", "If with multiple data values in same row, please extract the latest.", "- Reported names:", diff --git a/main.py b/main.py index 55ee1fd..176120b 100644 --- a/main.py +++ b/main.py @@ -1110,11 +1110,11 @@ if __name__ == "__main__": "546046730", "546919329" ] - special_doc_id_list = ["479742284"] + special_doc_id_list = ["506326520"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True + re_run_mapping_data = True force_save_total_data = False calculate_metrics = False diff --git a/utils/pdf_util.py b/utils/pdf_util.py index 3040a87..bdadff2 100644 --- a/utils/pdf_util.py +++ b/utils/pdf_util.py @@ -374,18 +374,17 @@ class PDFUtil: if highlight_value is not None and len(highlight_value.strip()) > 0: pure_highlight_value = highlight_value.strip() + highlight_value_search_text = None if len(pure_highlight_value.split()) == 1 and \ (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]): highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value) - highlight_value_search = re.search(highlight_value_regex, page_text) + highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False) else: highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False) - highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE) - if highlight_value_search is None: + highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True) + if highlight_value_search_text is None: highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True) - highlight_value_search = re.search(highlight_value_regex, page_text) - if highlight_value_search is not None: - highlight_value_search_text = highlight_value_search.group() + highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False) annotation_data = {"pdf_file": self.simple_pdf_file, "page_index": page_index, @@ -433,6 +432,19 @@ class PDFUtil: annotation_data["matching_val_area"] = bbox_list return annotation_data + def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True): + if ignore_case: + highlight_value_search_iter = re.finditer(highlight_value_regex, page_text, re.IGNORECASE) + else: + highlight_value_search_iter = re.finditer(highlight_value_regex, page_text) + + highlight_value_search_text = None + for highlight_value_search in highlight_value_search_iter: + highlight_value_search_text = highlight_value_search.group().strip() + if highlight_value_search_text == raw_value: + return highlight_value_search_text + return highlight_value_search_text + def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True): if text is None or len(text) == 0: return text @@ -442,7 +454,13 @@ class PDFUtil: continue replace = r"\{0}".format(special_iter.group()) if replace not in text: - text = re.sub(replace, r"\\W", text) + special_iter_text = special_iter.group() + if special_iter_text == ")" and text.strip()[-1] == ")" and \ + text.strip().count(")") == 1: + text = text.replace(")", r"\)") + else: + text = re.sub(replace, r"\\W", text) + text = re.sub(r"( ){2,}", " ", text) if match_special_char_after_space: text = text.replace(" ", r"\s*\W*")