1. optimize instructions for fund name
2. optimize drilldown logic
This commit is contained in:
parent
c2d2e54670
commit
7a41b03634
|
|
@ -27,9 +27,14 @@
|
|||
"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
|
||||
"b. The sub-fund name may be as the first column or first row values in the table.",
|
||||
"b.1 fund name example:",
|
||||
"- context:",
|
||||
"---- context:",
|
||||
"Summary information\nCapital International Fund Audited Annual Report 2023 | 15\nFootnotes are on page 17.\nCapital Group Multi-Sector \nIncome Fund (LUX) \n(CGMSILU)\nCapital Group US High Yield \nFund (LUX) (CGUSHYLU)\nCapital Group Emerging \nMarkets Debt Fund (LUX) \n(CGEMDLU)",
|
||||
"fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
|
||||
"---- fund names: Capital International Group Multi-Sector Income Fund (LUX), Capital International Group US High Yield Fund (LUX), Capital International Group Emerging Markets Debt Fund (LUX)",
|
||||
"c. The fund name should be the nearest fund for relevant share/ data point/ value.",
|
||||
"c.1 fund name example:",
|
||||
"---- context:",
|
||||
"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26% \n1.29%",
|
||||
"---- correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
|
||||
"- Only extract the latest data from context:",
|
||||
"If with multiple data values in same row, please extract the latest.",
|
||||
"- Reported names:",
|
||||
|
|
|
|||
6
main.py
6
main.py
|
|
@ -1110,11 +1110,11 @@ if __name__ == "__main__":
|
|||
"546046730",
|
||||
"546919329"
|
||||
]
|
||||
special_doc_id_list = ["479742284"]
|
||||
special_doc_id_list = ["506326520"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
||||
|
|
|
|||
|
|
@ -374,18 +374,17 @@ class PDFUtil:
|
|||
if highlight_value is not None and len(highlight_value.strip()) > 0:
|
||||
pure_highlight_value = highlight_value.strip()
|
||||
|
||||
highlight_value_search_text = None
|
||||
if len(pure_highlight_value.split()) == 1 and \
|
||||
(len(pure_highlight_value) < 3 or pure_highlight_value[0].upper() == pure_highlight_value[0]):
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
|
||||
else:
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=False)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text, re.IGNORECASE)
|
||||
if highlight_value_search is None:
|
||||
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=True)
|
||||
if highlight_value_search_text is None:
|
||||
highlight_value_regex = self.add_slash_to_text_as_regex(pure_highlight_value, match_special_char_after_space=True)
|
||||
highlight_value_search = re.search(highlight_value_regex, page_text)
|
||||
if highlight_value_search is not None:
|
||||
highlight_value_search_text = highlight_value_search.group()
|
||||
highlight_value_search_text = self.get_proper_search_text(pure_highlight_value, highlight_value_regex, page_text, ignore_case=False)
|
||||
|
||||
annotation_data = {"pdf_file": self.simple_pdf_file,
|
||||
"page_index": page_index,
|
||||
|
|
@ -433,6 +432,19 @@ class PDFUtil:
|
|||
annotation_data["matching_val_area"] = bbox_list
|
||||
return annotation_data
|
||||
|
||||
def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
    """Pick the text on the page that best represents ``raw_value``.

    Every match of ``highlight_value_regex`` in ``page_text`` is examined in
    order. The first match whose stripped text equals ``raw_value`` exactly
    (case-sensitive comparison, even when the regex search ignores case) is
    returned immediately. If no match is an exact equal, the stripped text
    of the last match found is returned instead.

    Args:
        raw_value: The value being looked for, compared verbatim against
            each stripped match.
        highlight_value_regex: Regular expression used to locate candidates.
        page_text: Text to search in.
        ignore_case: When True the regex search uses ``re.IGNORECASE``.

    Returns:
        The chosen matched text, or ``None`` when nothing matches or when
        the regex or the page text is ``None``.
    """
    # Nothing to search with or in: report "no match" rather than letting
    # re.finditer raise TypeError.
    if highlight_value_regex is None or page_text is None:
        return None

    flags = re.IGNORECASE if ignore_case else 0

    candidate_text = None
    for candidate in re.finditer(highlight_value_regex, page_text, flags):
        candidate_text = candidate.group().strip()
        # An exact match wins over any later looser match.
        if candidate_text == raw_value:
            return candidate_text
    # No exact match: fall back to the last candidate seen (None if none).
    return candidate_text
|
||||
|
||||
def add_slash_to_text_as_regex(self, text: str, match_special_char_after_space: bool = True):
|
||||
if text is None or len(text) == 0:
|
||||
return text
|
||||
|
|
@ -442,7 +454,13 @@ class PDFUtil:
|
|||
continue
|
||||
replace = r"\{0}".format(special_iter.group())
|
||||
if replace not in text:
|
||||
special_iter_text = special_iter.group()
|
||||
if special_iter_text == ")" and text.strip()[-1] == ")" and \
|
||||
text.strip().count(")") == 1:
|
||||
text = text.replace(")", r"\)")
|
||||
else:
|
||||
text = re.sub(replace, r"\\W", text)
|
||||
|
||||
text = re.sub(r"( ){2,}", " ", text)
|
||||
if match_special_char_after_space:
|
||||
text = text.replace(" ", r"\s*\W*")
|
||||
|
|
|
|||
Loading…
Reference in New Issue