1. optimize instructions for fund name

2. optimize drilldown logic
This commit is contained in:
Blade He 2024-11-12 17:01:10 -06:00
parent c2d2e54670
commit 7a41b03634
3 changed files with 35 additions and 12 deletions

View File

@ -27,9 +27,14 @@
"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
"b. The sub-fund name may be as the first column or first row values in the table.", "b. The sub-fund name may be as the first column or first row values in the table.",
"b.1 fund name example:", "b.1 fund name example:",
"- context:", "---- context:",
"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)", "Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)", "---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
"c. The fund name should be the nearest fund for relevant share/ data point/ value.",
"c.1 fund name example:",
"---- context:",
"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26% \n1.29%",
"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
"- Only extract the latest data from context:", "- Only extract the latest data from context:",
"If with multiple data values in same row, please extract the latest.", "If with multiple data values in same row, please extract the latest.",
"- Reported names:", "- Reported names:",

View File

@ -1110,11 +1110,11 @@ if __name__ == "__main__":
"546046730", "546046730",
"546919329" "546919329"
] ]
special_doc_id_list = ["479742284"] special_doc_id_list = ["506326520"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False re_run_extract_data = True
re_run_mapping_data = False re_run_mapping_data = True
force_save_total_data = False force_save_total_data = False
calculate_metrics = False calculate_metrics = False

View File

@ -374,18 +374,17 @@ class PDFUtil:
if highlight_value is not None and len(highlight_value.strip()) > 0: if highlight_value is not None and len(highlight_value.strip()) > 0:
pure_highlight_value = highlight_value.strip() pure_highlight_value = highlight_value.strip()
highlight_value_search_text = None
if len(pure_highlight_value.split()) == 1 and \ if len(pure_highlight_value.split()) == 1 and \
(len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]): (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value) highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
highlight_value_search = re.search(highlight_value_regex, page_text) highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
else: else:
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False) highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE) highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
if highlight_value_search is None: if highlight_value_search_text is None:
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True) highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
highlight_value_search = re.search(highlight_value_regex, page_text) highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
if highlight_value_search is not None:
highlight_value_search_text = highlight_value_search.group()
annotation_data = {"pdf_file": self.simple_pdf_file, annotation_data = {"pdf_file": self.simple_pdf_file,
"page_index": page_index, "page_index": page_index,
@ -433,6 +432,19 @@ class PDFUtil:
annotation_data["matching_val_area"] = bbox_list annotation_data["matching_val_area"] = bbox_list
return annotation_data return annotation_data
def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
    """Pick the text on the page that best corresponds to ``raw_value``.

    Every match of ``highlight_value_regex`` in ``page_text`` is collected
    with surrounding whitespace stripped.

    Args:
        raw_value: The value being looked for; compared for exact equality
            against each stripped match.
        highlight_value_regex: Regular expression used to scan the page text.
        page_text: Text to search.
        ignore_case: When true, the scan uses ``re.IGNORECASE``.

    Returns:
        The stripped match equal to ``raw_value`` if one exists; otherwise
        the last stripped match found; ``None`` when nothing matches.
    """
    flags = re.IGNORECASE if ignore_case else 0
    candidates = [
        found.group().strip()
        for found in re.finditer(highlight_value_regex, page_text, flags)
    ]
    if not candidates:
        return None
    # An exact hit wins over any looser match produced by the regex.
    for candidate in candidates:
        if candidate == raw_value:
            return candidate
    # No exact hit: fall back to the final match on the page.
    return candidates[-1]
def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True): def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
if text is None or len(text) == 0: if text is None or len(text) == 0:
return text return text
@ -442,7 +454,13 @@ class PDFUtil:
continue continue
replace = r"\{0}".format(special_iter.group()) replace = r"\{0}".format(special_iter.group())
if replace not in text: if replace not in text:
special_iter_text = special_iter.group()
if special_iter_text == ")" and text.strip()[-1] == ")" and \
text.strip().count(")") == 1:
text = text.replace(")", r"\)")
else:
text = re.sub(replace, r"\\W", text) text = re.sub(replace, r"\\W", text)
text = re.sub(r"( ){2,}", " ", text) text = re.sub(r"( ){2,}", " ", text)
if match_special_char_after_space: if match_special_char_after_space:
text = text.replace(" ", r"\s*\W*") text = text.replace(" ", r"\s*\W*")