1. Optimize instructions for fund name
2. Optimize drill-down logic

commit 7a41b03634 (parent c2d2e54670)
@@ -27,9 +27,14 @@
 "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
 "b. The sub-fund name may be as the first column or first row values in the table.",
 "b.1 fund name example:",
-"- context:",
+"---- context:",
 "Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
-"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+"---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
+"c. The fund name should be the nearest fund for relevant share/ data point/ value.",
+"c.1 fund name example:",
+"---- context:",
+"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26% \n1.29%",
+"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
 "- Only extract the latest data from context:",
 "If with multiple data values in same row, please extract the latest.",
 "- Reported names:",
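Editor's note: the new instruction c ("the nearest fund for relevant share/ data point/ value") is essentially a proximity rule over the extracted context. Below is a minimal, hypothetical sketch of that drill-down idea; the helper name nearest_fund_name and the offset-based distance are illustrative assumptions, not code from the repo.

import re

def nearest_fund_name(context, fund_names, value):
    # Hypothetical helper: pick the fund name whose occurrence sits closest
    # before the target value in the extracted context text.
    value_pos = context.find(value)
    if value_pos < 0:
        return None
    best_name, best_distance = None, None
    for name in fund_names:
        for match in re.finditer(re.escape(name), context):
            if match.start() <= value_pos:
                distance = value_pos - match.start()
                if best_distance is None or distance < best_distance:
                    best_name, best_distance = name, distance
    return best_name

context = ("Capital Group US High Yield Fund (LUX)\nTER 1.10%\n"
           "Capital Group Emerging Markets Debt Fund (LUX)\nTER 1.25%")
print(nearest_fund_name(context,
                        ["Capital Group US High Yield Fund (LUX)",
                         "Capital Group Emerging Markets Debt Fund (LUX)"],
                        "1.25%"))  # -> Capital Group Emerging Markets Debt Fund (LUX)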
main.py
@@ -1110,11 +1110,11 @@ if __name__ == "__main__":
         "546046730",
         "546919329"
     ]
-    special_doc_id_list = ["479742284"]
+    special_doc_id_list = ["506326520"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
-    re_run_extract_data = False
+    re_run_extract_data = True
-    re_run_mapping_data = False
+    re_run_mapping_data = True
     force_save_total_data = False
     calculate_metrics = False
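Editor's note: re_run_extract_data and re_run_mapping_data appear to be stage toggles. A minimal, hypothetical sketch of how such flags typically gate a two-stage pipeline against cached output follows; the function names and file layout are assumptions and not taken from main.py.

import os

def extract_data(doc_id, out_path):
    ...  # stand-in for the real extraction step

def map_data(extract_path, out_path):
    ...  # stand-in for the real mapping step

def run_doc(doc_id, folder, re_run_extract_data, re_run_mapping_data):
    extract_path = os.path.join(folder, f"{doc_id}_extract.json")
    mapping_path = os.path.join(folder, f"{doc_id}_mapping.json")
    # Re-run a stage when its flag is True or when no cached output exists yet.
    if re_run_extract_data or not os.path.exists(extract_path):
        extract_data(doc_id, extract_path)
    if re_run_mapping_data or not os.path.exists(mapping_path):
        map_data(extract_path, mapping_path)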
@@ -374,18 +374,17 @@ class PDFUtil:
         if highlight_value is not None and len(highlight_value.strip()) > 0:
             pure_highlight_value = highlight_value.strip()

+            highlight_value_search_text = None
             if len(pure_highlight_value.split()) == 1 and \
                     (len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
                 highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
-                highlight_value_search = re.search(highlight_value_regex, page_text)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
             else:
                 highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
-                highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
-            if highlight_value_search is None:
+            if highlight_value_search_text is None:
                 highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
-                highlight_value_search = re.search(highlight_value_regex, page_text)
-            if highlight_value_search is not None:
-                highlight_value_search_text = highlight_value_search.group()
+                highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)

             annotation_data = {"pdf_file": self.simple_pdf_file,
                                "page_index": page_index,
@@ -433,6 +432,19 @@ class PDFUtil:
             annotation_data["matching_val_area"] = bbox_list
         return annotation_data

+    def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
+        if ignore_case:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text, re.IGNORECASE)
+        else:
+            highlight_value_search_iter = re.finditer(highlight_value_regex, page_text)
+
+        highlight_value_search_text = None
+        for highlight_value_search in highlight_value_search_iter:
+            highlight_value_search_text = highlight_value_search.group().strip()
+            if highlight_value_search_text == raw_value:
+                return highlight_value_search_text
+        return highlight_value_search_text
+
     def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
         if text is None or len(text) == 0:
             return text
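Editor's note: the added get_proper_search_text walks every regex match and prefers the occurrence whose stripped text equals the raw value, falling back to the last match otherwise; a bare re.search would have returned the first, possibly case-mismatched, occurrence. A standalone copy of that logic for illustration (outside the class, with the ignore_case branching folded into a flags argument), plus a small usage example:

import re

def get_proper_search_text(raw_value, highlight_value_regex, page_text, ignore_case=True):
    # Mirrors the new PDFUtil method: scan all matches, prefer an exact hit on raw_value.
    flags = re.IGNORECASE if ignore_case else 0
    highlight_value_search_text = None
    for highlight_value_search in re.finditer(highlight_value_regex, page_text, flags):
        highlight_value_search_text = highlight_value_search.group().strip()
        if highlight_value_search_text == raw_value:
            return highlight_value_search_text
    return highlight_value_search_text

page_text = "Growth fund ... GROWTH FUND ... Growth Fund"
print(get_proper_search_text("Growth Fund", r"Growth\s*Fund", page_text))  # -> "Growth Fund"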
@@ -442,7 +454,13 @@ class PDFUtil:
                 continue
             replace = r"\{0}".format(special_iter.group())
             if replace not in text:
+                special_iter_text = special_iter.group()
+                if special_iter_text == ")" and text.strip()[-1] == ")" and \
+                        text.strip().count(")") == 1:
+                    text = text.replace(")", r"\)")
+                else:
-                text = re.sub(replace, r"\\W", text)
+                    text = re.sub(replace, r"\\W", text)
+
         text = re.sub(r"( ){2,}", " ", text)
         if match_special_char_after_space:
             text = text.replace(" ", r"\s*\W*")
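Editor's note: the new branch in add_slash_to_text_as_regex keeps a lone trailing ")" as a literal "\)" instead of widening it to \W. The regexes below are hand-written to illustrate the assumed motivation (they are not output of the method itself): the widened form accepts any non-word character where the closing parenthesis should be, while the escaped form requires the real ")".

import re

value_text = "Capital Group US High Yield Fund (LUX)"      # ends with a single ")"
page_text  = "Capital Group US High Yield Fund (LUX) 1.26%"

old_regex = r"Fund \WLUX\W"   # ")" widened to \W: any non-word character is accepted
new_regex = r"Fund \WLUX\)"   # trailing ")" now escaped literally

print(re.search(old_regex, "Fund -LUX- footnote"))   # loose pattern also matches unrelated text
print(re.search(new_regex, page_text).group())       # matches only the real "(LUX)"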