Optimize administration fee instructions

This commit is contained in:
Blade He 2025-02-28 22:12:18 -06:00
parent d4bc3aba4e
commit d3be711859
4 changed files with 33 additions and 22 deletions

View File

@@ -575,7 +575,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num != 16: # if page_num != 8:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue

View File

@@ -302,7 +302,11 @@ class FilterPages:
for split in search_text_split: for split in search_text_split:
if split[0].islower(): if split[0].islower():
lower_word_count += 1 lower_word_count += 1
if lower_word_count < lower_word_count_threshold: if self.doc_source == "emea_ar" and \
lower_word_count > lower_word_count_threshold:
is_valid = False
break
if re.search(self.percentage_regex, search_text) is not None: if re.search(self.percentage_regex, search_text) is not None:
is_valid = True is_valid = True
break break

View File

@@ -249,6 +249,13 @@
"----Example 1 End----", "----Example 1 End----",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}", "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
"\n",
"----Example 2 Start----",
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply)",
"----Example 2 End----",
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
"\n",
"Complex cases:", "Complex cases:",
"A. Need to add multiple numbers together.", "A. Need to add multiple numbers together.",
"----Example 1 Start----", "----Example 1 Start----",

View File

@@ -1043,8 +1043,8 @@ def batch_run_documents(
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = True force_save_total_data = True
calculate_metrics = False calculate_metrics = False
@@ -1521,7 +1521,7 @@ if __name__ == "__main__":
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
# special_doc_id_list: list = ["411062815"] # special_doc_id_list: list = ["412778803"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (