Support loading configurations via the doc_source parameter

This commit is contained in:
Blade He 2025-01-16 11:17:48 -06:00
parent acc30d4b72
commit 9f0e77a11e
23 changed files with 91 additions and 63 deletions

View File

@ -6,7 +6,6 @@
"552505237": [16, 17, 18, 19, 25, 26, 27],
"552505278": [12, 13, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29],
"554431052": [34, 35, 36, 41, 42, 43],
"554851189": [12, 13, 14],
"555377021": [21, 22, 23, 24, 25, 26],
"555654388": [35, 36]
}

View File

@ -0,0 +1,3 @@
{
"apply_pdf2html": true
}

View File

@ -14,6 +14,7 @@ from utils.biz_utils import add_slash_to_text_as_regex, clean_text, get_most_sim
class DataExtraction:
def __init__(
self,
doc_source: str,
doc_id: str,
pdf_file: str,
output_data_folder: str,
@ -24,8 +25,11 @@ class DataExtraction:
extract_way: str = "text",
output_image_folder: str = None,
) -> None:
self.doc_source = doc_source
self.doc_id = doc_id
self.pdf_file = pdf_file
self.configuration_folder = f"./configuration/{doc_source}/"
self.instruction_folder = f"./instructions/{doc_source}/"
if output_data_folder is None or len(output_data_folder) == 0:
output_data_folder = r"/data/emea_ar/output/extract_data/docs/"
os.makedirs(output_data_folder, exist_ok=True)
@ -75,7 +79,7 @@ class DataExtraction:
def get_investment_objective_pages(self):
investment_objective_pages = []
if self.document_type == 1:
objective_strategy_regex_config_file = r"./configuration/objective_strategy_regex.json"
objective_strategy_regex_config_file = os.path.join(self.configuration_folder, "objective_strategy_regex.json")
with open(objective_strategy_regex_config_file, "r", encoding="utf-8") as f:
objective_strategy_regex_config = json.load(f)
objective_start_regex = objective_strategy_regex_config.get("objective_strategy", {}).get("start", "")
@ -89,7 +93,7 @@ class DataExtraction:
return investment_objective_pages
def get_datapoint_reported_name(self):
language_config_file = r"./configuration/language.json"
language_config_file = os.path.join(self.configuration_folder, "language.json")
self.language_config = {}
with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file)
@ -97,7 +101,7 @@ class DataExtraction:
self.language_id = self.document_mapping_info_df["Language"].iloc[0]
self.language = self.language_config.get(self.language_id, None)
datapoint_reported_name_config_file = r"./configuration/datapoint_reported_name.json"
datapoint_reported_name_config_file = os.path.join(self.configuration_folder, "datapoint_reported_name.json")
all_datapoint_reported_name = {}
with open(datapoint_reported_name_config_file, "r", encoding="utf-8") as file:
all_datapoint_reported_name = json.load(file)
@ -139,19 +143,19 @@ class DataExtraction:
output_folder=self.output_image_folder)
def get_instructions_config(self) -> dict:
instructions_config_file = r"./instructions/data_extraction_prompts_config.json"
instructions_config_file = os.path.join(self.instruction_folder, "data_extraction_prompts_config.json")
with open(instructions_config_file, "r", encoding="utf-8") as f:
instructions_config = json.load(f)
return instructions_config
def get_datapoint_level(self) -> dict:
datapoint_level_file = r"./configuration/datapoint_level.json"
datapoint_level_file = os.path.join(self.configuration_folder, "datapoint_level.json")
with open(datapoint_level_file, "r", encoding="utf-8") as f:
datapoint_level = json.load(f)
return datapoint_level
def get_datapoint_name(self) -> dict:
datapoint_name_file = r"./configuration/datapoint_name.json"
datapoint_name_file = os.path.join(self.configuration_folder, "datapoint_name.json")
with open(datapoint_name_file, "r", encoding="utf-8") as f:
datapoint_name = json.load(f)
return datapoint_name
@ -463,10 +467,10 @@ class DataExtraction:
"""
logger.info(f"Extracting data from page {page_num}")
if self.document_type == 1:
pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
if pre_context in page_text:
page_text = page_text.replace(pre_context, "\n").strip()
# pre_context = f"The document type is prospectus. \nThe fund names in this document are {', '.join(self.fund_name_list)}."
# if pre_context in page_text:
# page_text = page_text.replace(pre_context, "\n").strip()
pre_context = ""
if len(self.investment_objective_pages) > 0:
# Get the page number of the most recent investment objective at the top of the current page.
diff_pages = [page_num - investment_objective_page for investment_objective_page
@ -479,7 +483,7 @@ class DataExtraction:
page_text = page_text.replace(top_nearest_investment_objective_text, "").strip()
pre_context = f"\nThe most recent investment objective page text which maybe with fund name is: \n{top_nearest_investment_objective_text}.\n"
# If the previous investment objective text cannot be found, prefix the page text with the fund names
page_text = f"{pre_context}\n{page_text}"
page_text = f"{pre_context}\n{page_text}".strip()
instructions = self.get_instructions_by_datapoints(
page_text,

View File

@ -17,9 +17,11 @@ class DataMapping:
raw_document_data_list: list,
document_mapping_info_df: pd.DataFrame,
output_data_folder: str,
doc_source: str = "emea_ar"
):
self.doc_id = doc_id
self.datapoints = datapoints
self.doc_source = doc_source
self.raw_document_data_list = raw_document_data_list
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)

View File

@ -12,11 +12,18 @@ from utils.pdf_util import get_pdf_pages_by_html
class FilterPages:
def __init__(
self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, apply_pdf2html: bool = False
self, doc_id: str, pdf_file: str, document_mapping_info_df: pd.DataFrame, doc_source: str = "emea_ar"
) -> None:
self.doc_id = doc_id
self.pdf_file = pdf_file
self.apply_pdf2html = apply_pdf2html
self.configuration_folder = f"./configuration/{doc_source}/"
misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
if os.path.exists(misc_config_file):
with open(misc_config_file, "r", encoding="utf-8") as file:
self.misc_config = json.load(file)
self.apply_pdf2html = self.misc_config.get("apply_pdf2html", False)
else:
self.apply_pdf2html = False
self.page_text_dict = self.get_pdf_page_text_dict()
if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
@ -31,7 +38,7 @@ class FilterPages:
self.percentage_regex = r"\b\d{1,3}\.\d+\b|\b\d{1,3}(\,\d+)\b%"
def get_document_dp_pages(self) -> dict:
document_dp_pages_file = r"./configuration/document_dp_pages.json"
document_dp_pages_file = os.path.join(self.configuration_folder, "document_dp_pages.json")
with open(document_dp_pages_file, "r", encoding="utf-8") as file:
self.document_dp_pages_config = json.load(file)
self.document_dp_pages = self.document_dp_pages_config.get(self.doc_id, [])
@ -43,6 +50,7 @@ class FilterPages:
if len(page_text_dict.keys()) == 0:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
self.apply_pdf2html = False
else:
pdf_util = PDFUtil(self.pdf_file)
success, text, page_text_dict = pdf_util.extract_text()
@ -52,12 +60,11 @@ class FilterPages:
"""
Remark: remove the
"""
language_config_file = r"./configuration/language.json"
domicile_datapoint_config_file = r"./configuration/domicile_datapoints.json"
datapoint_keywords_config_file = r"./configuration/datapoint_keyword.json"
datapoint_exclude_keywords_config_file = (
r"./configuration/datapoint_exclude_keyword.json"
)
language_config_file = os.path.join(self.configuration_folder, "language.json")
domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
with open(language_config_file, "r", encoding="utf-8") as file:
self.language_config = json.load(file)
with open(domicile_datapoint_config_file, "r", encoding="utf-8") as file:

60
main.py
View File

@ -23,15 +23,16 @@ class EMEA_AR_Parsing:
def __init__(
self,
doc_id: str,
doc_source: str = "emea_ar",
pdf_folder: str = r"/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
extract_way: str = "text",
apply_pdf2html: bool = False,
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
) -> None:
self.doc_id = doc_id
self.doc_source = doc_source
self.pdf_folder = pdf_folder
os.makedirs(self.pdf_folder, exist_ok=True)
self.pdf_file = self.download_pdf()
@ -70,12 +71,12 @@ class EMEA_AR_Parsing:
os.makedirs(self.output_mapping_data_folder, exist_ok=True)
self.filter_pages = FilterPages(
self.doc_id, self.pdf_file, self.document_mapping_info_df, apply_pdf2html
self.doc_id, self.pdf_file, self.document_mapping_info_df, self.doc_source
)
self.page_text_dict = self.filter_pages.page_text_dict
try:
os.makedirs(output_pdf_text_folder, exist_ok=True)
if apply_pdf2html:
if self.filter_pages.apply_pdf2html:
output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pdf2html/")
else:
output_pdf_text_folder = os.path.join(output_pdf_text_folder, "pymupdf/")
@ -132,6 +133,7 @@ class EMEA_AR_Parsing:
if not found_data:
try:
data_extraction = DataExtraction(
self.doc_source,
self.doc_id,
self.pdf_file,
self.output_extract_data_folder,
@ -263,15 +265,16 @@ class EMEA_AR_Parsing:
return data_mapping.mapping_raw_data()
def filter_pages(doc_id: str, pdf_folder: str) -> None:
def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> None:
logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(doc_id, pdf_folder)
emea_ar_parsing = EMEA_AR_Parsing(doc_id, doc_source=doc_source, pdf_folder=pdf_folder)
datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info()
return datapoint_page_info, result_details
def extract_data(
doc_id: str,
doc_source: str,
pdf_folder: str,
output_data_folder: str,
extract_way: str = "text",
@ -280,7 +283,8 @@ def extract_data(
logger.info(f"Extract EMEA AR data for doc_id: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(
doc_id,
pdf_folder,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_extract_data_folder=output_data_folder,
extract_way=extract_way,
)
@ -294,8 +298,8 @@ def mapping_data(
output_pdf_text_folder: str,
output_extract_data_folder: str,
output_mapping_folder: str,
doc_source: str = "emea_ar",
extract_way: str = "text",
apply_pdf2html: bool = False,
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
re_run_extract_data: bool = False,
re_run_mapping_data: bool = False,
@ -303,12 +307,12 @@ def mapping_data(
logger.info(f"Extract EMEA AR data for doc_id: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(
doc_id,
pdf_folder,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_folder=output_extract_data_folder,
output_mapping_data_folder=output_mapping_folder,
extract_way=extract_way,
apply_pdf2html=apply_pdf2html,
drilldown_folder=drilldown_folder,
)
doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
@ -320,6 +324,7 @@ def mapping_data(
def batch_extract_data(
pdf_folder: str,
doc_source: str = "emea_ar",
doc_data_excel_file: str = None,
output_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
@ -349,6 +354,7 @@ def batch_extract_data(
continue
data_from_gpt = extract_data(
doc_id=doc_id,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_data_folder=output_child_folder,
extract_way=extract_way,
@ -372,7 +378,8 @@ def batch_extract_data(
def batch_start_job(
pdf_folder: str,
doc_source: str = "emea_ar",
pdf_folder: str = "/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
doc_data_excel_file: str = None,
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
@ -380,7 +387,6 @@ def batch_start_job(
output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
extract_way: str = "text",
apply_pdf2html: bool = False,
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
special_doc_id_list: list = None,
re_run_extract_data: bool = False,
@ -418,8 +424,8 @@ def batch_start_job(
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_folder=output_extract_data_child_folder,
output_mapping_folder=output_mapping_child_folder,
doc_source=doc_source,
extract_way=extract_way,
apply_pdf2html=apply_pdf2html,
drilldown_folder=drilldown_folder,
re_run_extract_data=re_run_extract_data,
re_run_mapping_data=re_run_mapping_data,
@ -531,6 +537,7 @@ def only_output_mapping_data_in_db(mapping_data: pd.DataFrame) -> None:
def batch_filter_pdf_files(
pdf_folder: str,
doc_source: str = "emea_ar",
doc_data_excel_file: str = None,
output_folder: str = r"/data/emea_ar/output/filter_pages/",
special_doc_id_list: list = None,
@ -556,7 +563,7 @@ def batch_filter_pdf_files(
if doc_list is not None and doc_id not in doc_list:
continue
doc_datapoint_page_info, doc_result_details = filter_pages(
doc_id=doc_id, pdf_folder=pdf_folder
doc_id=doc_id, pdf_folder=pdf_folder, doc_source=doc_source
)
result_list.append(doc_datapoint_page_info)
result_details.extend(doc_result_details)
@ -631,7 +638,7 @@ def test_auto_generate_instructions():
datapoint_list.remove("doc_id")
data_extraction = DataExtraction(
doc_id, pdf_file, page_text_dict, datapoint_page_info, document_mapping_info_df
"emear_ar", doc_id, pdf_file, page_text_dict, datapoint_page_info, document_mapping_info_df
)
page_index_list = list(page_text_dict.keys())
if len(page_index_list) > 0:
@ -898,15 +905,15 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
def batch_run_documents(special_doc_id_list: list = None,
def batch_run_documents(doc_source: str = "emea_ar",
special_doc_id_list: list = None,
pdf_folder:str = r"/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
output_extract_data_child_folder:str = r"/data/emea_ar/output/extract_data/docs/",
output_extract_data_total_folder:str = r"/data/emea_ar/output/extract_data/total/",
output_mapping_child_folder:str = r"/data/emea_ar/output/mapping_data/docs/",
output_mapping_total_folder:str = r"/data/emea_ar/output/mapping_data/total/",
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
apply_pdf2html: bool = False):
drilldown_folder: str = r"/data/emea_ar/output/drilldown/"):
sample_document_list_folder = r'./sample_documents/'
document_list_files = glob(sample_document_list_folder + "*.txt")
page_filter_ground_truth_file = (
@ -914,7 +921,7 @@ def batch_run_documents(special_doc_id_list: list = None,
)
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
force_save_total_data = False
calculate_metrics = False
extract_way = "text"
@ -932,6 +939,7 @@ def batch_run_documents(special_doc_id_list: list = None,
doc_id_list = f.readlines()
doc_id_list = [doc_id.strip() for doc_id in doc_id_list]
batch_start_job(
doc_source,
pdf_folder,
output_pdf_text_folder,
page_filter_ground_truth_file,
@ -940,7 +948,6 @@ def batch_run_documents(special_doc_id_list: list = None,
output_extract_data_total_folder,
output_mapping_total_folder,
extract_way,
apply_pdf2html,
drilldown_folder,
doc_id_list,
re_run_extract_data,
@ -951,6 +958,7 @@ def batch_run_documents(special_doc_id_list: list = None,
)
else:
batch_start_job(
doc_source,
pdf_folder,
output_pdf_text_folder,
page_filter_ground_truth_file,
@ -959,7 +967,6 @@ def batch_run_documents(special_doc_id_list: list = None,
output_extract_data_total_folder,
output_mapping_total_folder,
extract_way,
apply_pdf2html,
drilldown_folder,
special_doc_id_list,
re_run_extract_data,
@ -971,6 +978,7 @@ def batch_run_documents(special_doc_id_list: list = None,
def batch_initial_document(sample_document_list_folder: str = r'./sample_documents/',
document_list_file: str = "sample_document_complex.txt",
doc_source: str = "emea_ar",
pdf_folder: str = r"/data/emea_ar/pdf/",
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/"):
@ -981,6 +989,7 @@ def batch_initial_document(sample_document_list_folder: str = r'./sample_documen
for doc_id in tqdm(doc_id_list):
logger.info(f"Start to initial document: {doc_id}")
emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
doc_source=doc_source,
pdf_folder=pdf_folder,
output_extract_data_folder=output_extract_data_child_folder,
output_mapping_data_folder=output_mapping_child_folder)
@ -1154,6 +1163,8 @@ if __name__ == "__main__":
# special_doc_id_list = ["553242411"]
doc_source = "aus_prospectus"
if doc_source == "aus_prospectus":
special_doc_id_list: list = ["539790009",
"542300403",
"542301117",
@ -1165,7 +1176,7 @@ if __name__ == "__main__":
"554851189",
"555377021",
"555654388"]
# special_doc_id_list: list = ["542301117"]
special_doc_id_list: list = ["554851189"]
pdf_folder:str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder:str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder:str = r"/data/aus_prospectus/output/extract_data/docs/"
@ -1173,8 +1184,10 @@ if __name__ == "__main__":
output_mapping_child_folder:str = r"/data/aus_prospectus/output/mapping_data/docs/"
output_mapping_total_folder:str = r"/data/aus_prospectus/output/mapping_data/total/"
drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
apply_pdf2html = True
batch_run_documents(special_doc_id_list=special_doc_id_list,
elif doc_source == "emea_ar":
special_doc_id_list = ["553242411"]
batch_run_documents(doc_source=doc_source,
special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder,
output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_child_folder=output_extract_data_child_folder,
@ -1182,7 +1195,6 @@ if __name__ == "__main__":
output_mapping_child_folder=output_mapping_child_folder,
output_mapping_total_folder=output_mapping_total_folder,
drilldown_folder=drilldown_folder,
apply_pdf2html=apply_pdf2html
)
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"

View File

@ -22,6 +22,7 @@ def test_validate_extraction_data():
datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)
data_extraction = DataExtraction(
doc_source="emea_ar",
doc_id=document_id,
pdf_file=pdf_file,
output_data_folder=output_extract_data_child_folder,