Confirm the span-pages calculation: treat a page as a management fee and costs page only when its records contain just the management_fee_and_costs and management_fee datapoints

This commit is contained in:
Blade He 2025-04-03 18:08:27 -05:00
parent f333cc30f5
commit 37cf06a394
2 changed files with 10 additions and 5 deletions

View File

@ -786,6 +786,11 @@ class DataExtraction:
share_name = management_fee_data.get("share_name", "")
if fund_name == "" or share_name == "":
continue
remain_keys = [key for key in keys if key not in ["fund_name", "share_name",
"management_fee_and_costs",
"management_fee"]]
if len(remain_keys) > 0:
continue
if "management_fee_and_costs" in keys:
management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1)
try:

10
main.py
View File

@ -1531,18 +1531,18 @@ if __name__ == "__main__":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
# )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_46_documents_sample.txt"
# )
document_sample_file = (
r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
r"./sample_documents/aus_prospectus_46_documents_sample.txt"
)
# document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
# )
logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
if len(doc_id.strip()) > 0]
# special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
# special_doc_id_list = ["539999907", "455235248", "448576924"]
# special_doc_id_list = ["462780211", "539999907"]
pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = (