From f71e2968cca27aebf338e8270497c6376e665145 Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 9 Dec 2024 22:24:40 -0600 Subject: [PATCH] simplify code --- core/data_extraction.py | 2 - main.py | 358 ++---------------- sample_documents/big_document.txt | 34 ++ .../document_with_all_4_datapoints.txt | 30 ++ .../emea_case_from_word_complex.txt | 23 ++ ...ish_document_with_ground_truth_regular.txt | 46 +++ .../new_sample_document_regular.txt | 29 ++ sample_documents/sample_document_complex.txt | 31 ++ 8 files changed, 226 insertions(+), 327 deletions(-) create mode 100644 sample_documents/big_document.txt create mode 100644 sample_documents/document_with_all_4_datapoints.txt create mode 100644 sample_documents/emea_case_from_word_complex.txt create mode 100644 sample_documents/english_document_with_ground_truth_regular.txt create mode 100644 sample_documents/new_sample_document_regular.txt create mode 100644 sample_documents/sample_document_complex.txt diff --git a/core/data_extraction.py b/core/data_extraction.py index 7e326e0..6a992df 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -171,8 +171,6 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - if page_num > 160: - break if page_num in handled_page_num_list: continue page_datapoints = self.get_datapoints_by_page_num(page_num) diff --git a/main.py b/main.py index 4065458..7fd97fd 100644 --- a/main.py +++ b/main.py @@ -869,329 +869,10 @@ def replace_rerun_data(new_data_file: str, original_data_file: str): new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet) -def batch_run_documents(): - # special_doc_id_list = ["505174428", "510326848", "349679479"] - # check_mapping_doc_id_list = [ - # "327956364", - # "391456740", - # "391736837", - # "458359181", - # "486383912", - # "497497599", - # "529925114", - # "321733631", - # "334718372", - # "344636875", - # "362246081", - # "445256897", - # "449623976", - # "458291624", - # "478585901", - # "492121213", - # "502821436", - # "507967525", - # "481475385", - # "508854243", - # "520879048", - # "402181770", - # "463081566", - # "502693599", - # "509845549", - # "389171486", - # "323390570", - # "366179419", - # "486378555", - # "506559375", - # "479793787", - # "471641628", - # ] - # English documents with ground truth - # check_db_mapping_doc_id_list = [ - # "292989214", - # "316237292", - # "321733631", - # "323390570", - # "327956364", - # "332223498", - # "333207452", - # "334718372", - # "344636875", - # "362246081", - # "366179419", - # "380945052", - # "382366116", - # "387202452", - # "389171486", - # "391456740", - # "391736837", - # "394778487", - # "401684600", - # "402113224", - # "402181770", - # "402397014", - # "405803396", - # "445102363", - # "445256897", - # "448265376", - # "449555622", - # "449623976", - # "458291624", - # "458359181", - # "463081566", - # "469138353", - # "471641628", - # "476492237", - # "478585901", - # "478586066", - # "479042264", - # "479042269", - # "479793787", - # "481475385", - # "483617247", - # "486378555", - # "486383912", - # "492121213", - # "497497599", - # "502693599" - # ] +def batch_run_documents(): + sample_document_list_folder = r'./sample_documents/' + document_list_files = glob(sample_document_list_folder + "*.txt") - # Documents in EMEA Case 1.docx - # check_db_mapping_doc_id_list = [ - # "424976833", - # "425480144", - # "427637151", - # "429564034", - # "429950833", - # "430240853", - # "431073795", - # "434710819", - # "434851173", - # "434902020", - # "434924914", - # "435128656", - # "440029306", - # "466371135", - # "466528487", - # "466859621", - # "466860852", - # "467595142", - # "467788879", - # "470515549" - # ] - - # documents in New EMEA Documents sample.xlsx as typical documents - # """ - # Below 9 documents can't get data by keywords or ChatGPT - # 526747539, - # 534112077, - # 535798742, - # 536299372, - # 539566148, - # 541343431, - # 541923319, - # 543243585, - # 543243654 - # """ - # check_db_mapping_doc_id_list = [ - # "511052670", - # "520733219", - # "524306810", - # "526747539", - # "528783089", - # "532422720", - # "532438210", - # "534112077", - # "534538571", - # "534538682", - # "535798742", - # "536299372", - # "539566148", - # "539604165", - # "540056900", - # "541343431", - # "541669780", - # "541669996", - # "541670397", - # "541923319", - # "542335994", - # "543243585", - # "543243654", - # "543244170", - # "543519140", - # "543519615", - # "543628379", - # "543809340", - # "543944737" - # ] - - # documents in Final list of EMEA documents.xlsx as typical documents - # check_db_mapping_doc_id_list = [ - # "532500349", - # "535324239", - # "532442891", - # "543243650", - # "528588598", - # "532437639", - # "527525440", - # "534987291", - # "534112055", - # "533482585", - # "544208174", - # "534547266", - # "544713166", - # "526463547", - # "534535569", - # "534106067", - # "532486560", - # "532781760", - # "533727067", - # "527256381", - # "533392425", - # "532179676", - # "534300608", - # "539233950", - # # "533727908", - # "532438414", - # "533681744", - # "537654645", - # "533594905", - # "537926443", - # "533499655", - # "533862814", - # "544918611", - # "539087870", - # "536343790" - # ] - - # document samples 2024-11-06 - # check_db_mapping_doc_id_list = ["546483469", - # "546375582", - # "546375575", - # "546375576", - # "546375577", - # "546375568", - # "546371033", - # "546632761", - # "546632544", - # "546632464", - # "546724583", - # "546724552", - # "546694677", - # "546660422", - # "546638908", - # "546632845", - # "546105299", - # "546085481", - # "546078693", - # "546078650", - # "546289930", - # "546289910", - # "542967371", - # "542798238", - # "546048730", - # "546048143", - # "546047619", - # "546047528", - # "546046730", - # "546919329"] - - # document samples: 30 documents, all with 4 data points - # check_db_mapping_doc_id_list = ["479742284", - # "501380497", - # "501380553", - # "501380775", - # "501380801", - # "501600428", - # "501600429", - # "501600541", - # "501600549", - # "503659548", - # "506326520", - # "507720522", - # "507928179", - # "508981020", - # "509133771", - # "509743502", - # "514636951", - # "514636952", - # "514636953", - # "514636954", - # "514636955", - # "514636957", - # "514636958", - # "514636959", - # "514636985", - # "514636988", - # "514636990", - # "514636993", - # "514636994", - # "539794746", - # ] - # Sample documents with special cases - check_db_mapping_doc_id_list = [ - "334584772", - "406913630", - "407275419", - "337937633", - "337293427", - "404712928", - "451063582", - "451878128", - "425595958", - "536344026", - "532422548", - "423418540", - "423418395", - "532998065", - "540307575", - "423395975", - "508704368", - "481482392", - "466580448", - "423365707", - "423364758", - "422761666", - "422760156", - "422760148", - "422686965", - "492029971", - "510300817", - "512745032", - "514213638", - "527525440", - "534535767" - ] - # total_data_prefix = "complex_doc_" - # documents from EMEA Case 1.docx - check_db_mapping_doc_id_list = [ - "510483464", - "525412280", - "528208797", - "435128656", - "425480144", - "466528487", - "434902020", - "440029306", - "431073795", - "430240853", - "427637151", - "434924914", - "467595142", - "466859621", - "429564034", - "424976833", - "466860852", - "466371135", - "470515549", - "434851173", - "434710819", - "429950833", - "467788879" - ] - total_data_prefix = "complex_doc_from_word" - special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["334584772", "435128656"] - special_doc_id_list = ["514213638"] pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" @@ -1205,8 +886,36 @@ def batch_run_documents(): force_save_total_data = False calculate_metrics = False - extract_ways = ["text"] - for extract_way in extract_ways: + extract_way = "text" + special_doc_id_list = [] + if len(special_doc_id_list) == 0: + force_save_total_data = True + file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"] + for document_list_file in document_list_files: + file_base_name = os.path.basename(document_list_file).replace(".txt", "") + if (file_base_name_candidates is not None and + len(file_base_name_candidates) > 0 and + file_base_name not in file_base_name_candidates): + continue + with open(document_list_file, "r", encoding="utf-8") as f: + doc_id_list = f.readlines() + doc_id_list = [doc_id.strip() for doc_id in doc_id_list] + batch_start_job( + pdf_folder, + page_filter_ground_truth_file, + output_extract_data_child_folder, + output_mapping_child_folder, + output_extract_data_total_folder, + output_mapping_total_folder, + extract_way, + doc_id_list, + re_run_extract_data, + re_run_mapping_data, + force_save_total_data=force_save_total_data, + calculate_metrics=calculate_metrics, + total_data_prefix=file_base_name + ) + else: batch_start_job( pdf_folder, page_filter_ground_truth_file, @@ -1220,7 +929,6 @@ def batch_run_documents(): re_run_mapping_data, force_save_total_data=force_save_total_data, calculate_metrics=calculate_metrics, - total_data_prefix=total_data_prefix ) diff --git a/sample_documents/big_document.txt b/sample_documents/big_document.txt new file mode 100644 index 0000000..57fd84b --- /dev/null +++ b/sample_documents/big_document.txt @@ -0,0 +1,34 @@ +532500349 +535324239 +532442891 +543243650 +528588598 +532437639 +527525440 +534987291 +534112055 +533482585 +544208174 +534547266 +544713166 +526463547 +534535569 +534106067 +532486560 +532781760 +533727067 +527256381 +533392425 +532179676 +534300608 +539233950 +532438414 +533681744 +537654645 +533594905 +537926443 +533499655 +533862814 +544918611 +539087870 +536343790 \ No newline at end of file diff --git a/sample_documents/document_with_all_4_datapoints.txt b/sample_documents/document_with_all_4_datapoints.txt new file mode 100644 index 0000000..3ad8061 --- /dev/null +++ b/sample_documents/document_with_all_4_datapoints.txt @@ -0,0 +1,30 @@ +479742284 +501380497 +501380553 +501380775 +501380801 +501600428 +501600429 +501600541 +501600549 +503659548 +506326520 +507720522 +507928179 +508981020 +509133771 +509743502 +514636951 +514636952 +514636953 +514636954 +514636955 +514636957 +514636958 +514636959 +514636985 +514636988 +514636990 +514636993 +514636994 +539794746 \ No newline at end of file diff --git a/sample_documents/emea_case_from_word_complex.txt b/sample_documents/emea_case_from_word_complex.txt new file mode 100644 index 0000000..7af6fa8 --- /dev/null +++ b/sample_documents/emea_case_from_word_complex.txt @@ -0,0 +1,23 @@ +510483464 +525412280 +528208797 +435128656 +425480144 +466528487 +434902020 +440029306 +431073795 +430240853 +427637151 +434924914 +467595142 +466859621 +429564034 +424976833 +466860852 +466371135 +470515549 +434851173 +434710819 +429950833 +467788879 \ No newline at end of file diff --git a/sample_documents/english_document_with_ground_truth_regular.txt b/sample_documents/english_document_with_ground_truth_regular.txt new file mode 100644 index 0000000..62f7ff6 --- /dev/null +++ b/sample_documents/english_document_with_ground_truth_regular.txt @@ -0,0 +1,46 @@ +292989214 +316237292 +321733631 +323390570 +327956364 +332223498 +333207452 +334718372 +344636875 +362246081 +366179419 +380945052 +382366116 +387202452 +389171486 +391456740 +391736837 +394778487 +401684600 +402113224 +402181770 +402397014 +405803396 +445102363 +445256897 +448265376 +449555622 +449623976 +458291624 +458359181 +463081566 +469138353 +471641628 +476492237 +478585901 +478586066 +479042264 +479042269 +479793787 +481475385 +483617247 +486378555 +486383912 +492121213 +497497599 +502693599 \ No newline at end of file diff --git a/sample_documents/new_sample_document_regular.txt b/sample_documents/new_sample_document_regular.txt new file mode 100644 index 0000000..cc4f11d --- /dev/null +++ b/sample_documents/new_sample_document_regular.txt @@ -0,0 +1,29 @@ +511052670 +520733219 +524306810 +526747539 +528783089 +532422720 +532438210 +534112077 +534538571 +534538682 +535798742 +536299372 +539566148 +539604165 +540056900 +541343431 +541669780 +541669996 +541670397 +541923319 +542335994 +543243585 +543243654 +543244170 +543519140 +543519615 +543628379 +543809340 +543944737 \ No newline at end of file diff --git a/sample_documents/sample_document_complex.txt b/sample_documents/sample_document_complex.txt new file mode 100644 index 0000000..b88a8be --- /dev/null +++ b/sample_documents/sample_document_complex.txt @@ -0,0 +1,31 @@ +334584772 +406913630 +407275419 +337937633 +337293427 +404712928 +451063582 +451878128 +425595958 +536344026 +532422548 +423418540 +423418395 +532998065 +540307575 +423395975 +508704368 +481482392 +466580448 +423365707 +423364758 +422761666 +422760156 +422760148 +422686965 +492029971 +510300817 +512745032 +514213638 +527525440 +534535767 \ No newline at end of file