optimize administration fees instructions
This commit is contained in:
parent
d4bc3aba4e
commit
d3be711859
|
|
@ -575,7 +575,7 @@ class DataExtraction:
|
||||||
previous_page_datapoints = []
|
previous_page_datapoints = []
|
||||||
previous_page_fund_name = None
|
previous_page_fund_name = None
|
||||||
for page_num, page_text in self.page_text_dict.items():
|
for page_num, page_text in self.page_text_dict.items():
|
||||||
# if page_num != 16:
|
# if page_num != 8:
|
||||||
# continue
|
# continue
|
||||||
if page_num in handled_page_num_list:
|
if page_num in handled_page_num_list:
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -302,28 +302,32 @@ class FilterPages:
|
||||||
for split in search_text_split:
|
for split in search_text_split:
|
||||||
if split[0].islower():
|
if split[0].islower():
|
||||||
lower_word_count += 1
|
lower_word_count += 1
|
||||||
if lower_word_count < lower_word_count_threshold:
|
if self.doc_source == "emea_ar" and \
|
||||||
if re.search(self.percentage_regex, search_text) is not None:
|
lower_word_count > lower_word_count_threshold:
|
||||||
is_valid = True
|
is_valid = False
|
||||||
break
|
break
|
||||||
new_search_text_regex = add_slash_to_text_as_regex(search_text)
|
|
||||||
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
|
if re.search(self.percentage_regex, search_text) is not None:
|
||||||
new_search_text_regex
|
is_valid = True
|
||||||
)
|
break
|
||||||
new_search = re.search(new_search_regex, text, re.IGNORECASE)
|
new_search_text_regex = add_slash_to_text_as_regex(search_text)
|
||||||
if new_search is not None:
|
new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
|
||||||
next_line = new_search.group("next_line").strip()
|
new_search_text_regex
|
||||||
next_2_line = new_search.group("next_2_line").strip()
|
)
|
||||||
|
new_search = re.search(new_search_regex, text, re.IGNORECASE)
|
||||||
if re.search(big_number_regex, next_line) is not None or \
|
if new_search is not None:
|
||||||
re.search(big_number_regex, next_2_line) is not None:
|
next_line = new_search.group("next_line").strip()
|
||||||
is_valid = False
|
next_2_line = new_search.group("next_2_line").strip()
|
||||||
else:
|
|
||||||
is_valid = True
|
if re.search(big_number_regex, next_line) is not None or \
|
||||||
break
|
re.search(big_number_regex, next_2_line) is not None:
|
||||||
|
is_valid = False
|
||||||
else:
|
else:
|
||||||
is_valid = True
|
is_valid = True
|
||||||
break
|
break
|
||||||
|
else:
|
||||||
|
is_valid = True
|
||||||
|
break
|
||||||
return is_valid
|
return is_valid
|
||||||
|
|
||||||
def search_keyword(self, text: str, keyword: str):
|
def search_keyword(self, text: str, keyword: str):
|
||||||
|
|
|
||||||
|
|
@ -249,6 +249,13 @@
|
||||||
"----Example 1 End----",
|
"----Example 1 End----",
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
|
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
|
||||||
|
"\n",
|
||||||
|
"----Example 2 Start----",
|
||||||
|
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
|
||||||
|
"----Example 2 End----",
|
||||||
|
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
|
||||||
|
"\n",
|
||||||
"Complex cases:",
|
"Complex cases:",
|
||||||
"A. Need to add multiple numbers together.",
|
"A. Need to add multiple numbers together.",
|
||||||
"----Example 1 Start----",
|
"----Example 1 Start----",
|
||||||
|
|
|
||||||
6
main.py
6
main.py
|
|
@ -1043,8 +1043,8 @@ def batch_run_documents(
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
)
|
)
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = False
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
|
|
@ -1521,7 +1521,7 @@ if __name__ == "__main__":
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||||
# special_doc_id_list: list = ["411062815"]
|
# special_doc_id_list: list = ["412778803"]
|
||||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||||
output_extract_data_child_folder: str = (
|
output_extract_data_child_folder: str = (
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue