simplify code
This commit is contained in:
parent
75ea5e70de
commit
f71e2968cc
|
|
@ -171,8 +171,6 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
if page_num > 160:
|
||||
break
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
page_datapoints = self.get_datapoints_by_page_num(page_num)
|
||||
|
|
|
|||
358
main.py
358
main.py
|
|
@ -869,329 +869,10 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
|
|||
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
|
||||
|
||||
|
||||
def batch_run_documents():
|
||||
# special_doc_id_list = ["505174428", "510326848", "349679479"]
|
||||
# check_mapping_doc_id_list = [
|
||||
# "327956364",
|
||||
# "391456740",
|
||||
# "391736837",
|
||||
# "458359181",
|
||||
# "486383912",
|
||||
# "497497599",
|
||||
# "529925114",
|
||||
# "321733631",
|
||||
# "334718372",
|
||||
# "344636875",
|
||||
# "362246081",
|
||||
# "445256897",
|
||||
# "449623976",
|
||||
# "458291624",
|
||||
# "478585901",
|
||||
# "492121213",
|
||||
# "502821436",
|
||||
# "507967525",
|
||||
# "481475385",
|
||||
# "508854243",
|
||||
# "520879048",
|
||||
# "402181770",
|
||||
# "463081566",
|
||||
# "502693599",
|
||||
# "509845549",
|
||||
# "389171486",
|
||||
# "323390570",
|
||||
# "366179419",
|
||||
# "486378555",
|
||||
# "506559375",
|
||||
# "479793787",
|
||||
# "471641628",
|
||||
# ]
|
||||
# English documents with ground truth
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "292989214",
|
||||
# "316237292",
|
||||
# "321733631",
|
||||
# "323390570",
|
||||
# "327956364",
|
||||
# "332223498",
|
||||
# "333207452",
|
||||
# "334718372",
|
||||
# "344636875",
|
||||
# "362246081",
|
||||
# "366179419",
|
||||
# "380945052",
|
||||
# "382366116",
|
||||
# "387202452",
|
||||
# "389171486",
|
||||
# "391456740",
|
||||
# "391736837",
|
||||
# "394778487",
|
||||
# "401684600",
|
||||
# "402113224",
|
||||
# "402181770",
|
||||
# "402397014",
|
||||
# "405803396",
|
||||
# "445102363",
|
||||
# "445256897",
|
||||
# "448265376",
|
||||
# "449555622",
|
||||
# "449623976",
|
||||
# "458291624",
|
||||
# "458359181",
|
||||
# "463081566",
|
||||
# "469138353",
|
||||
# "471641628",
|
||||
# "476492237",
|
||||
# "478585901",
|
||||
# "478586066",
|
||||
# "479042264",
|
||||
# "479042269",
|
||||
# "479793787",
|
||||
# "481475385",
|
||||
# "483617247",
|
||||
# "486378555",
|
||||
# "486383912",
|
||||
# "492121213",
|
||||
# "497497599",
|
||||
# "502693599"
|
||||
# ]
|
||||
def batch_run_documents():
|
||||
sample_document_list_folder = r'./sample_documents/'
|
||||
document_list_files = glob(sample_document_list_folder + "*.txt")
|
||||
|
||||
# Documents in EMEA Case 1.docx
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "424976833",
|
||||
# "425480144",
|
||||
# "427637151",
|
||||
# "429564034",
|
||||
# "429950833",
|
||||
# "430240853",
|
||||
# "431073795",
|
||||
# "434710819",
|
||||
# "434851173",
|
||||
# "434902020",
|
||||
# "434924914",
|
||||
# "435128656",
|
||||
# "440029306",
|
||||
# "466371135",
|
||||
# "466528487",
|
||||
# "466859621",
|
||||
# "466860852",
|
||||
# "467595142",
|
||||
# "467788879",
|
||||
# "470515549"
|
||||
# ]
|
||||
|
||||
# documents in New EMEA Documents sample.xlsx as typical documents
|
||||
# """
|
||||
# Below 9 documents can't get data by keywords or ChatGPT
|
||||
# 526747539,
|
||||
# 534112077,
|
||||
# 535798742,
|
||||
# 536299372,
|
||||
# 539566148,
|
||||
# 541343431,
|
||||
# 541923319,
|
||||
# 543243585,
|
||||
# 543243654
|
||||
# """
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "511052670",
|
||||
# "520733219",
|
||||
# "524306810",
|
||||
# "526747539",
|
||||
# "528783089",
|
||||
# "532422720",
|
||||
# "532438210",
|
||||
# "534112077",
|
||||
# "534538571",
|
||||
# "534538682",
|
||||
# "535798742",
|
||||
# "536299372",
|
||||
# "539566148",
|
||||
# "539604165",
|
||||
# "540056900",
|
||||
# "541343431",
|
||||
# "541669780",
|
||||
# "541669996",
|
||||
# "541670397",
|
||||
# "541923319",
|
||||
# "542335994",
|
||||
# "543243585",
|
||||
# "543243654",
|
||||
# "543244170",
|
||||
# "543519140",
|
||||
# "543519615",
|
||||
# "543628379",
|
||||
# "543809340",
|
||||
# "543944737"
|
||||
# ]
|
||||
|
||||
# documents in Final list of EMEA documents.xlsx as typical documents
|
||||
# check_db_mapping_doc_id_list = [
|
||||
# "532500349",
|
||||
# "535324239",
|
||||
# "532442891",
|
||||
# "543243650",
|
||||
# "528588598",
|
||||
# "532437639",
|
||||
# "527525440",
|
||||
# "534987291",
|
||||
# "534112055",
|
||||
# "533482585",
|
||||
# "544208174",
|
||||
# "534547266",
|
||||
# "544713166",
|
||||
# "526463547",
|
||||
# "534535569",
|
||||
# "534106067",
|
||||
# "532486560",
|
||||
# "532781760",
|
||||
# "533727067",
|
||||
# "527256381",
|
||||
# "533392425",
|
||||
# "532179676",
|
||||
# "534300608",
|
||||
# "539233950",
|
||||
# # "533727908",
|
||||
# "532438414",
|
||||
# "533681744",
|
||||
# "537654645",
|
||||
# "533594905",
|
||||
# "537926443",
|
||||
# "533499655",
|
||||
# "533862814",
|
||||
# "544918611",
|
||||
# "539087870",
|
||||
# "536343790"
|
||||
# ]
|
||||
|
||||
# document samples 2024-11-06
|
||||
# check_db_mapping_doc_id_list = ["546483469",
|
||||
# "546375582",
|
||||
# "546375575",
|
||||
# "546375576",
|
||||
# "546375577",
|
||||
# "546375568",
|
||||
# "546371033",
|
||||
# "546632761",
|
||||
# "546632544",
|
||||
# "546632464",
|
||||
# "546724583",
|
||||
# "546724552",
|
||||
# "546694677",
|
||||
# "546660422",
|
||||
# "546638908",
|
||||
# "546632845",
|
||||
# "546105299",
|
||||
# "546085481",
|
||||
# "546078693",
|
||||
# "546078650",
|
||||
# "546289930",
|
||||
# "546289910",
|
||||
# "542967371",
|
||||
# "542798238",
|
||||
# "546048730",
|
||||
# "546048143",
|
||||
# "546047619",
|
||||
# "546047528",
|
||||
# "546046730",
|
||||
# "546919329"]
|
||||
|
||||
# document samples: 30 documents, all with 4 data points
|
||||
# check_db_mapping_doc_id_list = ["479742284",
|
||||
# "501380497",
|
||||
# "501380553",
|
||||
# "501380775",
|
||||
# "501380801",
|
||||
# "501600428",
|
||||
# "501600429",
|
||||
# "501600541",
|
||||
# "501600549",
|
||||
# "503659548",
|
||||
# "506326520",
|
||||
# "507720522",
|
||||
# "507928179",
|
||||
# "508981020",
|
||||
# "509133771",
|
||||
# "509743502",
|
||||
# "514636951",
|
||||
# "514636952",
|
||||
# "514636953",
|
||||
# "514636954",
|
||||
# "514636955",
|
||||
# "514636957",
|
||||
# "514636958",
|
||||
# "514636959",
|
||||
# "514636985",
|
||||
# "514636988",
|
||||
# "514636990",
|
||||
# "514636993",
|
||||
# "514636994",
|
||||
# "539794746",
|
||||
# ]
|
||||
# Sample documents with special cases
|
||||
check_db_mapping_doc_id_list = [
|
||||
"334584772",
|
||||
"406913630",
|
||||
"407275419",
|
||||
"337937633",
|
||||
"337293427",
|
||||
"404712928",
|
||||
"451063582",
|
||||
"451878128",
|
||||
"425595958",
|
||||
"536344026",
|
||||
"532422548",
|
||||
"423418540",
|
||||
"423418395",
|
||||
"532998065",
|
||||
"540307575",
|
||||
"423395975",
|
||||
"508704368",
|
||||
"481482392",
|
||||
"466580448",
|
||||
"423365707",
|
||||
"423364758",
|
||||
"422761666",
|
||||
"422760156",
|
||||
"422760148",
|
||||
"422686965",
|
||||
"492029971",
|
||||
"510300817",
|
||||
"512745032",
|
||||
"514213638",
|
||||
"527525440",
|
||||
"534535767"
|
||||
]
|
||||
# total_data_prefix = "complex_doc_"
|
||||
# documents from EMEA Case 1.docx
|
||||
check_db_mapping_doc_id_list = [
|
||||
"510483464",
|
||||
"525412280",
|
||||
"528208797",
|
||||
"435128656",
|
||||
"425480144",
|
||||
"466528487",
|
||||
"434902020",
|
||||
"440029306",
|
||||
"431073795",
|
||||
"430240853",
|
||||
"427637151",
|
||||
"434924914",
|
||||
"467595142",
|
||||
"466859621",
|
||||
"429564034",
|
||||
"424976833",
|
||||
"466860852",
|
||||
"466371135",
|
||||
"470515549",
|
||||
"434851173",
|
||||
"434710819",
|
||||
"429950833",
|
||||
"467788879"
|
||||
]
|
||||
total_data_prefix = "complex_doc_from_word"
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
special_doc_id_list = ["334584772", "435128656"]
|
||||
special_doc_id_list = ["514213638"]
|
||||
pdf_folder = r"/data/emea_ar/pdf/"
|
||||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
|
|
@ -1205,8 +886,36 @@ def batch_run_documents():
|
|||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
for extract_way in extract_ways:
|
||||
extract_way = "text"
|
||||
special_doc_id_list = []
|
||||
if len(special_doc_id_list) == 0:
|
||||
force_save_total_data = True
|
||||
file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
|
||||
for document_list_file in document_list_files:
|
||||
file_base_name = os.path.basename(document_list_file).replace(".txt", "")
|
||||
if (file_base_name_candidates is not None and
|
||||
len(file_base_name_candidates) > 0 and
|
||||
file_base_name not in file_base_name_candidates):
|
||||
continue
|
||||
with open(document_list_file, "r", encoding="utf-8") as f:
|
||||
doc_id_list = f.readlines()
|
||||
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
|
||||
batch_start_job(
|
||||
pdf_folder,
|
||||
page_filter_ground_truth_file,
|
||||
output_extract_data_child_folder,
|
||||
output_mapping_child_folder,
|
||||
output_extract_data_total_folder,
|
||||
output_mapping_total_folder,
|
||||
extract_way,
|
||||
doc_id_list,
|
||||
re_run_extract_data,
|
||||
re_run_mapping_data,
|
||||
force_save_total_data=force_save_total_data,
|
||||
calculate_metrics=calculate_metrics,
|
||||
total_data_prefix=file_base_name
|
||||
)
|
||||
else:
|
||||
batch_start_job(
|
||||
pdf_folder,
|
||||
page_filter_ground_truth_file,
|
||||
|
|
@ -1220,7 +929,6 @@ def batch_run_documents():
|
|||
re_run_mapping_data,
|
||||
force_save_total_data=force_save_total_data,
|
||||
calculate_metrics=calculate_metrics,
|
||||
total_data_prefix=total_data_prefix
|
||||
)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
532500349
|
||||
535324239
|
||||
532442891
|
||||
543243650
|
||||
528588598
|
||||
532437639
|
||||
527525440
|
||||
534987291
|
||||
534112055
|
||||
533482585
|
||||
544208174
|
||||
534547266
|
||||
544713166
|
||||
526463547
|
||||
534535569
|
||||
534106067
|
||||
532486560
|
||||
532781760
|
||||
533727067
|
||||
527256381
|
||||
533392425
|
||||
532179676
|
||||
534300608
|
||||
539233950
|
||||
532438414
|
||||
533681744
|
||||
537654645
|
||||
533594905
|
||||
537926443
|
||||
533499655
|
||||
533862814
|
||||
544918611
|
||||
539087870
|
||||
536343790
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
479742284
|
||||
501380497
|
||||
501380553
|
||||
501380775
|
||||
501380801
|
||||
501600428
|
||||
501600429
|
||||
501600541
|
||||
501600549
|
||||
503659548
|
||||
506326520
|
||||
507720522
|
||||
507928179
|
||||
508981020
|
||||
509133771
|
||||
509743502
|
||||
514636951
|
||||
514636952
|
||||
514636953
|
||||
514636954
|
||||
514636955
|
||||
514636957
|
||||
514636958
|
||||
514636959
|
||||
514636985
|
||||
514636988
|
||||
514636990
|
||||
514636993
|
||||
514636994
|
||||
539794746
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
510483464
|
||||
525412280
|
||||
528208797
|
||||
435128656
|
||||
425480144
|
||||
466528487
|
||||
434902020
|
||||
440029306
|
||||
431073795
|
||||
430240853
|
||||
427637151
|
||||
434924914
|
||||
467595142
|
||||
466859621
|
||||
429564034
|
||||
424976833
|
||||
466860852
|
||||
466371135
|
||||
470515549
|
||||
434851173
|
||||
434710819
|
||||
429950833
|
||||
467788879
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
292989214
|
||||
316237292
|
||||
321733631
|
||||
323390570
|
||||
327956364
|
||||
332223498
|
||||
333207452
|
||||
334718372
|
||||
344636875
|
||||
362246081
|
||||
366179419
|
||||
380945052
|
||||
382366116
|
||||
387202452
|
||||
389171486
|
||||
391456740
|
||||
391736837
|
||||
394778487
|
||||
401684600
|
||||
402113224
|
||||
402181770
|
||||
402397014
|
||||
405803396
|
||||
445102363
|
||||
445256897
|
||||
448265376
|
||||
449555622
|
||||
449623976
|
||||
458291624
|
||||
458359181
|
||||
463081566
|
||||
469138353
|
||||
471641628
|
||||
476492237
|
||||
478585901
|
||||
478586066
|
||||
479042264
|
||||
479042269
|
||||
479793787
|
||||
481475385
|
||||
483617247
|
||||
486378555
|
||||
486383912
|
||||
492121213
|
||||
497497599
|
||||
502693599
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
511052670
|
||||
520733219
|
||||
524306810
|
||||
526747539
|
||||
528783089
|
||||
532422720
|
||||
532438210
|
||||
534112077
|
||||
534538571
|
||||
534538682
|
||||
535798742
|
||||
536299372
|
||||
539566148
|
||||
539604165
|
||||
540056900
|
||||
541343431
|
||||
541669780
|
||||
541669996
|
||||
541670397
|
||||
541923319
|
||||
542335994
|
||||
543243585
|
||||
543243654
|
||||
543244170
|
||||
543519140
|
||||
543519615
|
||||
543628379
|
||||
543809340
|
||||
543944737
|
||||
|
|
@ -0,0 +1,31 @@
|
|||
334584772
|
||||
406913630
|
||||
407275419
|
||||
337937633
|
||||
337293427
|
||||
404712928
|
||||
451063582
|
||||
451878128
|
||||
425595958
|
||||
536344026
|
||||
532422548
|
||||
423418540
|
||||
423418395
|
||||
532998065
|
||||
540307575
|
||||
423395975
|
||||
508704368
|
||||
481482392
|
||||
466580448
|
||||
423365707
|
||||
423364758
|
||||
422761666
|
||||
422760156
|
||||
422760148
|
||||
422686965
|
||||
492029971
|
||||
510300817
|
||||
512745032
|
||||
514213638
|
||||
527525440
|
||||
534535767
|
||||
Loading…
Reference in New Issue