diff --git a/configuration/datapoint_exclude_keyword.json b/configuration/aus_prospectus/datapoint_exclude_keyword.json similarity index 100% rename from configuration/datapoint_exclude_keyword.json rename to configuration/aus_prospectus/datapoint_exclude_keyword.json diff --git a/configuration/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json similarity index 100% rename from configuration/datapoint_keyword.json rename to configuration/aus_prospectus/datapoint_keyword.json diff --git a/configuration/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json similarity index 100% rename from configuration/datapoint_level.json rename to configuration/aus_prospectus/datapoint_level.json diff --git a/configuration/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json similarity index 100% rename from configuration/datapoint_name.json rename to configuration/aus_prospectus/datapoint_name.json diff --git a/configuration/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json similarity index 100% rename from configuration/datapoint_reported_name.json rename to configuration/aus_prospectus/datapoint_reported_name.json diff --git a/configuration/document_dp_pages.json b/configuration/aus_prospectus/document_dp_pages.json similarity index 93% rename from configuration/document_dp_pages.json rename to configuration/aus_prospectus/document_dp_pages.json index 4fe058c..940dc20 100644 --- a/configuration/document_dp_pages.json +++ b/configuration/aus_prospectus/document_dp_pages.json @@ -6,7 +6,6 @@ "552505237": [16, 17, 18, 19, 25, 26, 27], "552505278": [12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29], "554431052": [34, 35, 36, 41, 42, 43], - "554851189": [12, 13, 14], "555377021": [21, 22, 23, 24, 25, 26], "555654388": [35, 36] } diff --git a/configuration/domicile_datapoints.json b/configuration/aus_prospectus/domicile_datapoints.json similarity index 100% rename from 
configuration/domicile_datapoints.json rename to configuration/aus_prospectus/domicile_datapoints.json diff --git a/configuration/language.json b/configuration/aus_prospectus/language.json similarity index 100% rename from configuration/language.json rename to configuration/aus_prospectus/language.json diff --git a/configuration/aus_prospectus/misc_config.json b/configuration/aus_prospectus/misc_config.json new file mode 100644 index 0000000..1cd6c97 --- /dev/null +++ b/configuration/aus_prospectus/misc_config.json @@ -0,0 +1,3 @@ +{ + "apply_pdf2html": true +} \ No newline at end of file diff --git a/configuration/objective_strategy_regex.json b/configuration/aus_prospectus/objective_strategy_regex.json similarity index 100% rename from configuration/objective_strategy_regex.json rename to configuration/aus_prospectus/objective_strategy_regex.json diff --git a/core/data_extraction.py b/core/data_extraction.py index 5175264..51b9cf1 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -14,6 +14,7 @@ from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_sim class DataExtraction: def __init__( self, + doc_source: str, doc_id: str, pdf_file: str, output_data_folder: str, @@ -24,8 +25,11 @@ class DataExtraction: extract_way: str = "text", output_image_folder: str = None, ) -> None: + self.doc_source = doc_source self.doc_id = doc_id self.pdf_file = pdf_file + self.configuration_folder = f"./configuration/{doc_source}/" + self.instruction_folder = f"./instructions/{doc_source}/" if output_data_folder is None or len(output_data_folder) == 0: output_data_folder = r"/data/emea_ar/output/extract_data/docs/" os.makedirs(output_data_folder, exist_ok=True) @@ -75,7 +79,7 @@ class DataExtraction: def get_investment_objective_pages(self): investment_objective_pages = [] if self.document_type == 1: - objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json" + objective_strategy_regex_config_file = 
os.path.join(self.configuration_folder, "objective_strategy_regex.json") with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f: objective_strategy_regex_config = json.load(f) objective_start_regex = objective_strategy_regex_config.get("objective_strategy", {}).get("start", "") @@ -89,7 +93,7 @@ class DataExtraction: return investment_objective_pages def get_datapoint_reported_name(self): - language_config_file = r"./configuration/language.json" + language_config_file = os.path.join(self.configuration_folder, "language.json") self.language_config = {} with open(language_config_file, "r", encoding="utf-8") as file: self.language_config = json.load(file) @@ -97,7 +101,7 @@ class DataExtraction: self.language_id = self.document_mapping_info_df["Language"].iloc[0] self.language = self.language_config.get(self.language_id, None) - datapoint_reported_name_config_file = r"./configuration/datapoint_reported_name.json" + datapoint_reported_name_config_file = os.path.join(self.configuration_folder, "datapoint_reported_name.json") all_datapoint_reported_name = {} with open(datapoint_reported_name_config_file, "r", encoding="utf-8") as file: all_datapoint_reported_name = json.load(file) @@ -139,19 +143,19 @@ class DataExtraction: output_folder=self.output_image_folder) def get_instructions_config(self) -> dict: - instructions_config_file = r"./instructions/data_extraction_prompts_config.json" + instructions_config_file = os.path.join(self.instruction_folder, "data_extraction_prompts_config.json") with open(instructions_config_file, "r", encoding="utf-8") as f: instructions_config = json.load(f) return instructions_config def get_datapoint_level(self) -> dict: - datapoint_level_file = r"./configuration/datapoint_level.json" + datapoint_level_file = os.path.join(self.configuration_folder, "datapoint_level.json") with open(datapoint_level_file, "r", encoding="utf-8") as f: datapoint_level = json.load(f) return datapoint_level def get_datapoint_name(self) -> 
dict: - datapoint_name_file = r"./configuration/datapoint_name.json" + datapoint_name_file = os.path.join(self.configuration_folder, "datapoint_name.json") with open(datapoint_name_file, "r", encoding="utf-8") as f: datapoint_name = json.load(f) return datapoint_name @@ -463,10 +467,10 @@ class DataExtraction: """ logger.info(f"Extracting data from page {page_num}") if self.document_type == 1: - pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}." - if pre_context in page_text: - page_text = page_text.replace(pre_context, "\n").strip() - + # pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}." + # if pre_context in page_text: + # page_text = page_text.replace(pre_context, "\n").strip() + pre_context = "" if len(self.investment_objective_pages) > 0: # Get the page number of the most recent investment objective at the top of the current page. diff_pages = [page_num - investment_objective_page for investment_objective_page @@ -479,7 +483,7 @@ class DataExtraction: page_text = page_text.replace(top_nearest_investment_objective_text, "").strip() pre_context = f"\nThe most recent investment objective page text which maybe with fund name is: \n{top_nearest_investment_objective_text}.\n" # If can't find previous investment objective text, add the fund names to be the pre-fix of page text - page_text = f"{pre_context}\n{page_text}" + page_text = f"{pre_context}\n{page_text}".strip() instructions = self.get_instructions_by_datapoints( page_text, diff --git a/core/data_mapping.py b/core/data_mapping.py index 6288d36..f76bf54 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -17,9 +17,11 @@ class DataMapping: raw_document_data_list: list, document_mapping_info_df: pd.DataFrame, output_data_folder: str, + doc_source: str = "emea_ar" ): self.doc_id = doc_id self.datapoints = datapoints + self.doc_source = doc_source 
self.raw_document_data_list = raw_document_data_list if document_mapping_info_df is None or len(document_mapping_info_df) == 0: self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) diff --git a/core/page_filter.py b/core/page_filter.py index 748ed12..534720c 100644 --- a/core/page_filter.py +++ b/core/page_filter.py @@ -12,11 +12,18 @@ from utils.pdf_util import get_pdf_pages_by_html class FilterPages: def __init__( - self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False + self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, doc_source: str = "emea_ar" ) -> None: self.doc_id = doc_id self.pdf_file = pdf_file - self.apply_pdf2html = apply_pdf2html + self.configuration_folder = f"./configuration/{doc_source}/" + misc_config_file = os.path.join(self.configuration_folder, "misc_config.json") + if os.path.exists(misc_config_file): + with open(misc_config_file, "r", encoding="utf-8") as file: + self.misc_config = json.load(file) + self.apply_pdf2html = self.misc_config.get("apply_pdf2html", False) + else: + self.apply_pdf2html = False self.page_text_dict = self.get_pdf_page_text_dict() if document_mapping_info_df is None or len(document_mapping_info_df) == 0: self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) @@ -31,7 +38,7 @@ class FilterPages: self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%" def get_document_dp_pages(self) -> dict: - document_dp_pages_file = r"./configuration/document_dp_pages.json" + document_dp_pages_file = os.path.join(self.configuration_folder, "document_dp_pages.json") with open(document_dp_pages_file, "r", encoding="utf-8") as file: self.document_dp_pages_config = json.load(file) self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, []) @@ -43,6 +50,7 @@ class FilterPages: if len(page_text_dict.keys()) == 0: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = 
pdf_util.extract_text() + self.apply_pdf2html = False else: pdf_util = PDFUtil(self.pdf_file) success, text, page_text_dict = pdf_util.extract_text() @@ -52,12 +60,11 @@ class FilterPages: """ Remark: remove the """ - language_config_file = r"./configuration/language.json" - domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json" - datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json" - datapoint_exclude_keywords_config_file = ( - r"./configuration/datapoint_exclude_keyword.json" - ) + language_config_file = os.path.join(self.configuration_folder, "language.json") + domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json") + datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json") + datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json") + with open(language_config_file, "r", encoding="utf-8") as file: self.language_config = json.load(file) with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file: diff --git a/instructions/data_extraction_image_prompts.txt b/instructions/aus_prospectus/data_extraction_image_prompts.txt similarity index 100% rename from instructions/data_extraction_image_prompts.txt rename to instructions/aus_prospectus/data_extraction_image_prompts.txt diff --git a/instructions/data_extraction_prompts.txt b/instructions/aus_prospectus/data_extraction_prompts.txt similarity index 100% rename from instructions/data_extraction_prompts.txt rename to instructions/aus_prospectus/data_extraction_prompts.txt diff --git a/instructions/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json similarity index 100% rename from instructions/data_extraction_prompts_config.json rename to instructions/aus_prospectus/data_extraction_prompts_config.json diff --git a/instructions/table_extraction_image_optimize_prompts.txt 
b/instructions/aus_prospectus/table_extraction_image_optimize_prompts.txt similarity index 100% rename from instructions/table_extraction_image_optimize_prompts.txt rename to instructions/aus_prospectus/table_extraction_image_optimize_prompts.txt diff --git a/instructions/table_extraction_image_prompts.txt b/instructions/aus_prospectus/table_extraction_image_prompts.txt similarity index 100% rename from instructions/table_extraction_image_prompts.txt rename to instructions/aus_prospectus/table_extraction_image_prompts.txt diff --git a/instructions/table_extraction_image_prompts_v2.txt b/instructions/aus_prospectus/table_extraction_image_prompts_v2.txt similarity index 100% rename from instructions/table_extraction_image_prompts_v2.txt rename to instructions/aus_prospectus/table_extraction_image_prompts_v2.txt diff --git a/instructions/table_extraction_prompts.txt b/instructions/aus_prospectus/table_extraction_prompts.txt similarity index 100% rename from instructions/table_extraction_prompts.txt rename to instructions/aus_prospectus/table_extraction_prompts.txt diff --git a/instructions/text_extraction_image_prompts.txt b/instructions/aus_prospectus/text_extraction_image_prompts.txt similarity index 100% rename from instructions/text_extraction_image_prompts.txt rename to instructions/aus_prospectus/text_extraction_image_prompts.txt diff --git a/main.py b/main.py index d8e1acf..1ebcf98 100644 --- a/main.py +++ b/main.py @@ -23,15 +23,16 @@ class EMEA_AR_Parsing: def __init__( self, doc_id: str, + doc_source: str = "emea_ar", pdf_folder: str = r"/data/emea_ar/pdf/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", extract_way: str = "text", - apply_pdf2html: bool = False, drilldown_folder: str = r"/data/emea_ar/output/drilldown/", ) -> None: self.doc_id = doc_id + self.doc_source = doc_source 
self.pdf_folder = pdf_folder os.makedirs(self.pdf_folder, exist_ok=True) self.pdf_file = self.download_pdf() @@ -70,12 +71,12 @@ class EMEA_AR_Parsing: os.makedirs(self.output_mapping_data_folder, exist_ok=True) self.filter_pages = FilterPages( - self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html + self.doc_id, self.pdf_file, self.document_mapping_info_df, self.doc_source ) self.page_text_dict = self.filter_pages.page_text_dict try: os.makedirs(output_pdf_text_folder, exist_ok=True) - if apply_pdf2html: + if self.filter_pages.apply_pdf2html: output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/") else: output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/") @@ -132,6 +133,7 @@ class EMEA_AR_Parsing: if not found_data: try: data_extraction = DataExtraction( + self.doc_source, self.doc_id, self.pdf_file, self.output_extract_data_folder, @@ -263,15 +265,16 @@ class EMEA_AR_Parsing: return data_mapping.mapping_raw_data() -def filter_pages(doc_id: str, pdf_folder: str) -> None: +def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> None: logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}") - emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder) + emea_ar_parsing = EMEA_AR_Parsing(doc_id, doc_source=doc_source, pdf_folder=pdf_folder) datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info() return datapoint_page_info, result_details def extract_data( doc_id: str, + doc_source: str, pdf_folder: str, output_data_folder: str, extract_way: str = "text", @@ -280,7 +283,8 @@ def extract_data( logger.info(f"Extract EMEA AR data for doc_id: {doc_id}") emea_ar_parsing = EMEA_AR_Parsing( doc_id, - pdf_folder, + doc_source=doc_source, + pdf_folder=pdf_folder, output_extract_data_folder=output_data_folder, extract_way=extract_way, ) @@ -294,8 +298,8 @@ def mapping_data( output_pdf_text_folder: str, output_extract_data_folder: str, output_mapping_folder: str, + doc_source: str = 
"emea_ar", extract_way: str = "text", - apply_pdf2html: bool = False, drilldown_folder: str = r"/data/emea_ar/output/drilldown/", re_run_extract_data: bool = False, re_run_mapping_data: bool = False, @@ -303,12 +307,12 @@ def mapping_data( logger.info(f"Extract EMEA AR data for doc_id: {doc_id}") emea_ar_parsing = EMEA_AR_Parsing( doc_id, - pdf_folder, + doc_source=doc_source, + pdf_folder=pdf_folder, output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_folder, output_mapping_data_folder=output_mapping_folder, extract_way=extract_way, - apply_pdf2html=apply_pdf2html, drilldown_folder=drilldown_folder, ) doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data) @@ -320,6 +324,7 @@ def mapping_data( def batch_extract_data( pdf_folder: str, + doc_source: str = "emea_ar", doc_data_excel_file: str = None, output_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_total_folder: str = r"/data/emea_ar/output/extract_data/total/", @@ -349,6 +354,7 @@ def batch_extract_data( continue data_from_gpt = extract_data( doc_id=doc_id, + doc_source=doc_source, pdf_folder=pdf_folder, output_data_folder=output_child_folder, extract_way=extract_way, @@ -372,7 +378,8 @@ def batch_extract_data( def batch_start_job( - pdf_folder: str, + doc_source: str = "emea_ar", + pdf_folder: str = "/data/emea_ar/pdf/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", doc_data_excel_file: str = None, output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", @@ -380,7 +387,6 @@ def batch_start_job( output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/", extract_way: str = "text", - apply_pdf2html: bool = False, drilldown_folder: str = r"/data/emea_ar/output/drilldown/", special_doc_id_list: list = None, re_run_extract_data: bool = False, @@ -418,8 +424,8 @@ def 
batch_start_job( output_pdf_text_folder=output_pdf_text_folder, output_extract_data_folder=output_extract_data_child_folder, output_mapping_folder=output_mapping_child_folder, + doc_source=doc_source, extract_way=extract_way, - apply_pdf2html=apply_pdf2html, drilldown_folder=drilldown_folder, re_run_extract_data=re_run_extract_data, re_run_mapping_data=re_run_mapping_data, @@ -531,6 +537,7 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None: def batch_filter_pdf_files( pdf_folder: str, + doc_source: str = "emea_ar", doc_data_excel_file: str = None, output_folder: str = r"/data/emea_ar/output/filter_pages/", special_doc_id_list: list = None, @@ -556,7 +563,7 @@ def batch_filter_pdf_files( if doc_list is not None and doc_id not in doc_list: continue doc_datapoint_page_info, doc_result_details = filter_pages( - doc_id=doc_id, pdf_folder=pdf_folder + doc_id=doc_id, pdf_folder=pdf_folder, doc_source=doc_source ) result_list.append(doc_datapoint_page_info) result_details.extend(doc_result_details) @@ -631,7 +638,7 @@ def test_auto_generate_instructions(): datapoint_list.remove("doc_id") data_extraction = DataExtraction( - doc_id, pdf_file, page_text_dict, datapoint_page_info, document_mapping_info_df + "emea_ar", doc_id, pdf_file, page_text_dict, datapoint_page_info, document_mapping_info_df ) page_index_list = list(page_text_dict.keys()) if len(page_index_list) > 0: @@ -898,15 +905,15 @@ def replace_rerun_data(new_data_file: str, original_data_file: str): new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet) -def batch_run_documents(special_doc_id_list: list = None, +def batch_run_documents(doc_source: str = "emea_ar", + special_doc_id_list: list = None, pdf_folder:str = r"/data/emea_ar/pdf/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/",
output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/", - drilldown_folder: str = r"/data/emea_ar/output/drilldown/", - apply_pdf2html: bool = False): + drilldown_folder: str = r"/data/emea_ar/output/drilldown/"): sample_document_list_folder = r'./sample_documents/' document_list_files = glob(sample_document_list_folder + "*.txt") page_filter_ground_truth_file = ( @@ -914,7 +921,7 @@ def batch_run_documents(special_doc_id_list: list = None, ) re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_way = "text" @@ -932,6 +939,7 @@ def batch_run_documents(special_doc_id_list: list = None, doc_id_list = f.readlines() doc_id_list = [doc_id.strip() for doc_id in doc_id_list] batch_start_job( + doc_source, pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, @@ -940,7 +948,6 @@ def batch_run_documents(special_doc_id_list: list = None, output_extract_data_total_folder, output_mapping_total_folder, extract_way, - apply_pdf2html, drilldown_folder, doc_id_list, re_run_extract_data, @@ -951,6 +958,7 @@ def batch_run_documents(special_doc_id_list: list = None, ) else: batch_start_job( + doc_source, pdf_folder, output_pdf_text_folder, page_filter_ground_truth_file, @@ -959,7 +967,6 @@ def batch_run_documents(special_doc_id_list: list = None, output_extract_data_total_folder, output_mapping_total_folder, extract_way, - apply_pdf2html, drilldown_folder, special_doc_id_list, re_run_extract_data, @@ -971,6 +978,7 @@ def batch_run_documents(special_doc_id_list: list = None, def batch_initial_document(sample_document_list_folder: str = r'./sample_documents/', document_list_file: str = "sample_document_complex.txt", + doc_source: str = "emea_ar", pdf_folder: str = r"/data/emea_ar/pdf/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", 
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/"): @@ -981,6 +989,7 @@ def batch_initial_document(sample_document_list_folder: str = r'./sample_documen for doc_id in tqdm(doc_id_list): logger.info(f"Start to initial document: {doc_id}") emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id, + doc_source=doc_source, pdf_folder=pdf_folder, output_extract_data_folder=output_extract_data_child_folder, output_mapping_data_folder=output_mapping_child_folder) @@ -1154,27 +1163,31 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - special_doc_id_list: list = ["539790009", - "542300403", - "542301117", - "542306317", - "547567013", - "552505237", - "552505278", - "554431052", - "554851189", - "555377021", - "555654388"] - # special_doc_id_list: list = ["542301117"] - pdf_folder:str = r"/data/aus_prospectus/pdf/" - output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/" - output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/" - output_extract_data_total_folder:str = r"/data/aus_prospectus/output/extract_data/total/" - output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/" - output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/" - drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - apply_pdf2html = True - batch_run_documents(special_doc_id_list=special_doc_id_list, + doc_source = "aus_prospectus" + if doc_source == "aus_prospectus": + special_doc_id_list: list = ["539790009", + "542300403", + "542301117", + "542306317", + "547567013", + "552505237", + "552505278", + "554431052", + "554851189", + "555377021", + "555654388"] + special_doc_id_list: list = ["554851189"] + pdf_folder:str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/" + output_extract_data_total_folder:str = 
r"/data/aus_prospectus/output/extract_data/total/" + output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/" + output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/" + drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + elif doc_source == "emea_ar": + special_doc_id_list = ["553242411"] + batch_run_documents(doc_source=doc_source, + special_doc_id_list=special_doc_id_list, pdf_folder=pdf_folder, output_pdf_text_folder=output_pdf_text_folder, output_extract_data_child_folder=output_extract_data_child_folder, @@ -1182,7 +1195,6 @@ if __name__ == "__main__": output_mapping_child_folder=output_mapping_child_folder, output_mapping_total_folder=output_mapping_total_folder, drilldown_folder=drilldown_folder, - apply_pdf2html=apply_pdf2html ) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" diff --git a/test_specific_biz_logic.py b/test_specific_biz_logic.py index 32f5d2d..f1006f3 100644 --- a/test_specific_biz_logic.py +++ b/test_specific_biz_logic.py @@ -22,6 +22,7 @@ def test_validate_extraction_data(): datapoint_page_info, result_details = get_datapoint_page_info(filter_pages) datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info) data_extraction = DataExtraction( + doc_source="emea_ar", doc_id=document_id, pdf_file=pdf_file, output_data_folder=output_extract_data_child_folder,