diff --git a/core/data_extraction.py b/core/data_extraction.py index 258f0a9..989c1e9 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -786,6 +786,11 @@ class DataExtraction: share_name = management_fee_data.get("share_name", "") if fund_name == "" or share_name == "": continue + remain_keys = [key for key in keys if key not in ["fund_name", "share_name", + "management_fee_and_costs", + "management_fee"]] + if len(remain_keys) > 0: + continue if "management_fee_and_costs" in keys: management_fee_and_costs = management_fee_data.get("management_fee_and_costs", -1) try: diff --git a/main.py b/main.py index 6396ae0..c5438a5 100644 --- a/main.py +++ b/main.py @@ -1531,18 +1531,18 @@ if __name__ == "__main__": # document_sample_file = ( # r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # ) - # document_sample_file = ( - # r"./sample_documents/aus_prospectus_46_documents_sample.txt" - # ) document_sample_file = ( - r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt" + r"./sample_documents/aus_prospectus_46_documents_sample.txt" ) + # document_sample_file = ( + # r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt" + # ) logger.info(f"Start to run document sample file: {document_sample_file}") with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() if len(doc_id.strip()) > 0] # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"] - # special_doc_id_list = ["539999907", "455235248", "448576924"] + # special_doc_id_list = ["462780211", "539999907"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = (