Optimize administration fee instructions
This commit is contained in:
parent
d4bc3aba4e
commit
d3be711859
|
|
@ -575,7 +575,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num != 16:
|
||||
# if page_num != 8:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
|
|||
|
|
@ -302,7 +302,11 @@ class FilterPages:
|
|||
for split in search_text_split:
|
||||
if split[0].islower():
|
||||
lower_word_count += 1
|
||||
if lower_word_count < lower_word_count_threshold:
|
||||
if self.doc_source == "emea_ar" and \
|
||||
lower_word_count > lower_word_count_threshold:
|
||||
is_valid = False
|
||||
break
|
||||
|
||||
if re.search(self.percentage_regex, search_text) is not None:
|
||||
is_valid = True
|
||||
break
|
||||
|
|
|
|||
|
|
@ -249,6 +249,13 @@
|
|||
"----Example 1 End----",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29}]}",
|
||||
"\n",
|
||||
"----Example 2 Start----",
|
||||
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
|
||||
"----Example 2 End----",
|
||||
"The administration fee is $1.00 per week plus 0.17% pa, so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17}]}",
|
||||
"\n",
|
||||
"Complex cases:",
|
||||
"A. Need to add multiple numbers together.",
|
||||
"----Example 1 Start----",
|
||||
|
|
|
|||
6
main.py
6
main.py
|
|
@ -1043,8 +1043,8 @@ def batch_run_documents(
|
|||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
)
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
|
|
@ -1521,7 +1521,7 @@ if __name__ == "__main__":
|
|||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = ["411062815"]
|
||||
# special_doc_id_list: list = ["412778803"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
|
|||
Loading…
Reference in New Issue