simplify code

This commit is contained in:
Blade He 2024-12-09 22:24:40 -06:00
parent 75ea5e70de
commit f71e2968cc
8 changed files with 226 additions and 327 deletions

View File

@ -171,8 +171,6 @@ class DataExtraction:
previous_page_datapoints = []
previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items():
if page_num > 160:
break
if page_num in handled_page_num_list:
continue
page_datapoints = self.get_datapoints_by_page_num(page_num)

358
main.py
View File

@ -869,329 +869,10 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
def batch_run_documents():
# special_doc_id_list = ["505174428", "510326848", "349679479"]
# check_mapping_doc_id_list = [
# "327956364",
# "391456740",
# "391736837",
# "458359181",
# "486383912",
# "497497599",
# "529925114",
# "321733631",
# "334718372",
# "344636875",
# "362246081",
# "445256897",
# "449623976",
# "458291624",
# "478585901",
# "492121213",
# "502821436",
# "507967525",
# "481475385",
# "508854243",
# "520879048",
# "402181770",
# "463081566",
# "502693599",
# "509845549",
# "389171486",
# "323390570",
# "366179419",
# "486378555",
# "506559375",
# "479793787",
# "471641628",
# ]
# English documents with ground truth
# check_db_mapping_doc_id_list = [
# "292989214",
# "316237292",
# "321733631",
# "323390570",
# "327956364",
# "332223498",
# "333207452",
# "334718372",
# "344636875",
# "362246081",
# "366179419",
# "380945052",
# "382366116",
# "387202452",
# "389171486",
# "391456740",
# "391736837",
# "394778487",
# "401684600",
# "402113224",
# "402181770",
# "402397014",
# "405803396",
# "445102363",
# "445256897",
# "448265376",
# "449555622",
# "449623976",
# "458291624",
# "458359181",
# "463081566",
# "469138353",
# "471641628",
# "476492237",
# "478585901",
# "478586066",
# "479042264",
# "479042269",
# "479793787",
# "481475385",
# "483617247",
# "486378555",
# "486383912",
# "492121213",
# "497497599",
# "502693599"
# ]
def batch_run_documents():
sample_document_list_folder = r'./sample_documents/'
document_list_files = glob(sample_document_list_folder + "*.txt")
# Documents in EMEA Case 1.docx
# check_db_mapping_doc_id_list = [
# "424976833",
# "425480144",
# "427637151",
# "429564034",
# "429950833",
# "430240853",
# "431073795",
# "434710819",
# "434851173",
# "434902020",
# "434924914",
# "435128656",
# "440029306",
# "466371135",
# "466528487",
# "466859621",
# "466860852",
# "467595142",
# "467788879",
# "470515549"
# ]
# documents in New EMEA Documents sample.xlsx as typical documents
# """
# Below 9 documents can't get data by keywords or ChatGPT
# 526747539,
# 534112077,
# 535798742,
# 536299372,
# 539566148,
# 541343431,
# 541923319,
# 543243585,
# 543243654
# """
# check_db_mapping_doc_id_list = [
# "511052670",
# "520733219",
# "524306810",
# "526747539",
# "528783089",
# "532422720",
# "532438210",
# "534112077",
# "534538571",
# "534538682",
# "535798742",
# "536299372",
# "539566148",
# "539604165",
# "540056900",
# "541343431",
# "541669780",
# "541669996",
# "541670397",
# "541923319",
# "542335994",
# "543243585",
# "543243654",
# "543244170",
# "543519140",
# "543519615",
# "543628379",
# "543809340",
# "543944737"
# ]
# documents in Final list of EMEA documents.xlsx as typical documents
# check_db_mapping_doc_id_list = [
# "532500349",
# "535324239",
# "532442891",
# "543243650",
# "528588598",
# "532437639",
# "527525440",
# "534987291",
# "534112055",
# "533482585",
# "544208174",
# "534547266",
# "544713166",
# "526463547",
# "534535569",
# "534106067",
# "532486560",
# "532781760",
# "533727067",
# "527256381",
# "533392425",
# "532179676",
# "534300608",
# "539233950",
# # "533727908",
# "532438414",
# "533681744",
# "537654645",
# "533594905",
# "537926443",
# "533499655",
# "533862814",
# "544918611",
# "539087870",
# "536343790"
# ]
# document samples 2024-11-06
# check_db_mapping_doc_id_list = ["546483469",
# "546375582",
# "546375575",
# "546375576",
# "546375577",
# "546375568",
# "546371033",
# "546632761",
# "546632544",
# "546632464",
# "546724583",
# "546724552",
# "546694677",
# "546660422",
# "546638908",
# "546632845",
# "546105299",
# "546085481",
# "546078693",
# "546078650",
# "546289930",
# "546289910",
# "542967371",
# "542798238",
# "546048730",
# "546048143",
# "546047619",
# "546047528",
# "546046730",
# "546919329"]
# document samples: 30 documents, all with 4 data points
# check_db_mapping_doc_id_list = ["479742284",
# "501380497",
# "501380553",
# "501380775",
# "501380801",
# "501600428",
# "501600429",
# "501600541",
# "501600549",
# "503659548",
# "506326520",
# "507720522",
# "507928179",
# "508981020",
# "509133771",
# "509743502",
# "514636951",
# "514636952",
# "514636953",
# "514636954",
# "514636955",
# "514636957",
# "514636958",
# "514636959",
# "514636985",
# "514636988",
# "514636990",
# "514636993",
# "514636994",
# "539794746",
# ]
# Sample documents with special cases
check_db_mapping_doc_id_list = [
"334584772",
"406913630",
"407275419",
"337937633",
"337293427",
"404712928",
"451063582",
"451878128",
"425595958",
"536344026",
"532422548",
"423418540",
"423418395",
"532998065",
"540307575",
"423395975",
"508704368",
"481482392",
"466580448",
"423365707",
"423364758",
"422761666",
"422760156",
"422760148",
"422686965",
"492029971",
"510300817",
"512745032",
"514213638",
"527525440",
"534535767"
]
# total_data_prefix = "complex_doc_"
# documents from EMEA Case 1.docx
check_db_mapping_doc_id_list = [
"510483464",
"525412280",
"528208797",
"435128656",
"425480144",
"466528487",
"434902020",
"440029306",
"431073795",
"430240853",
"427637151",
"434924914",
"467595142",
"466859621",
"429564034",
"424976833",
"466860852",
"466371135",
"470515549",
"434851173",
"434710819",
"429950833",
"467788879"
]
total_data_prefix = "complex_doc_from_word"
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["334584772", "435128656"]
special_doc_id_list = ["514213638"]
pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -1205,8 +886,36 @@ def batch_run_documents():
force_save_total_data = False
calculate_metrics = False
extract_ways = ["text"]
for extract_way in extract_ways:
extract_way = "text"
special_doc_id_list = []
if len(special_doc_id_list) == 0:
force_save_total_data = True
file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
for document_list_file in document_list_files:
file_base_name = os.path.basename(document_list_file).replace(".txt", "")
if (file_base_name_candidates is not None and
len(file_base_name_candidates) > 0 and
file_base_name not in file_base_name_candidates):
continue
with open(document_list_file, "r", encoding="utf-8") as f:
doc_id_list = f.readlines()
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
batch_start_job(
pdf_folder,
page_filter_ground_truth_file,
output_extract_data_child_folder,
output_mapping_child_folder,
output_extract_data_total_folder,
output_mapping_total_folder,
extract_way,
doc_id_list,
re_run_extract_data,
re_run_mapping_data,
force_save_total_data=force_save_total_data,
calculate_metrics=calculate_metrics,
total_data_prefix=file_base_name
)
else:
batch_start_job(
pdf_folder,
page_filter_ground_truth_file,
@ -1220,7 +929,6 @@ def batch_run_documents():
re_run_mapping_data,
force_save_total_data=force_save_total_data,
calculate_metrics=calculate_metrics,
total_data_prefix=total_data_prefix
)

View File

@ -0,0 +1,34 @@
532500349
535324239
532442891
543243650
528588598
532437639
527525440
534987291
534112055
533482585
544208174
534547266
544713166
526463547
534535569
534106067
532486560
532781760
533727067
527256381
533392425
532179676
534300608
539233950
532438414
533681744
537654645
533594905
537926443
533499655
533862814
544918611
539087870
536343790

View File

@ -0,0 +1,30 @@
479742284
501380497
501380553
501380775
501380801
501600428
501600429
501600541
501600549
503659548
506326520
507720522
507928179
508981020
509133771
509743502
514636951
514636952
514636953
514636954
514636955
514636957
514636958
514636959
514636985
514636988
514636990
514636993
514636994
539794746

View File

@ -0,0 +1,23 @@
510483464
525412280
528208797
435128656
425480144
466528487
434902020
440029306
431073795
430240853
427637151
434924914
467595142
466859621
429564034
424976833
466860852
466371135
470515549
434851173
434710819
429950833
467788879

View File

@ -0,0 +1,46 @@
292989214
316237292
321733631
323390570
327956364
332223498
333207452
334718372
344636875
362246081
366179419
380945052
382366116
387202452
389171486
391456740
391736837
394778487
401684600
402113224
402181770
402397014
405803396
445102363
445256897
448265376
449555622
449623976
458291624
458359181
463081566
469138353
471641628
476492237
478585901
478586066
479042264
479042269
479793787
481475385
483617247
486378555
486383912
492121213
497497599
502693599

View File

@ -0,0 +1,29 @@
511052670
520733219
524306810
526747539
528783089
532422720
532438210
534112077
534538571
534538682
535798742
536299372
539566148
539604165
540056900
541343431
541669780
541669996
541670397
541923319
542335994
543243585
543243654
543244170
543519140
543519615
543628379
543809340
543944737

View File

@ -0,0 +1,31 @@
334584772
406913630
407275419
337937633
337293427
404712928
451063582
451878128
425595958
536344026
532422548
423418540
423418395
532998065
540307575
423395975
508704368
481482392
466580448
423365707
423364758
422761666
422760156
422760148
422686965
492029971
510300817
512745032
514213638
527525440
534535767