"""Manual test: re-validate the cached GPT extraction answers for a single
"emea_ar" document via DataExtraction.validate_data."""

import os
import json

import pandas as pd
from glob import glob
from tqdm import tqdm

from utils.logger import logger
from utils.sql_query_util import query_document_fund_mapping
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction


def test_validate_extraction_data():
    """Run the filter/extraction pipeline, then validate cached GPT answers."""
    document_id = "481482392"
    pdf_file = f"/data/emea_ar/pdf/{document_id}.pdf"
    output_extract_data_child_folder = r"/data/emea_ar/output/extract_data/docs/"
    output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"

    # Map the document to its funds, then locate the datapoint-relevant pages.
    document_mapping_info_df = query_document_fund_mapping(document_id, rerun=False)
    filter_pages = FilterPages(document_id, pdf_file, document_mapping_info_df)
    page_text_dict = filter_pages.page_text_dict
    datapoint_page_info, result_details = get_datapoint_page_info(filter_pages)
    datapoints = get_datapoints_from_datapoint_page_info(datapoint_page_info)

    data_extraction = DataExtraction(
        doc_source="emea_ar",
        doc_id=document_id,
        pdf_file=pdf_file,
        output_data_folder=output_extract_data_child_folder,
        page_text_dict=page_text_dict,
        datapoint_page_info=datapoint_page_info,
        datapoints=datapoints,
        document_mapping_info_df=document_mapping_info_df,
        extract_way="text",
        output_image_folder=None,
    )

    # Re-load the cached per-page GPT answers written by a previous "by_text" run.
    output_data_json_folder = os.path.join(
        r"/data/emea_ar/output/extract_data/docs/by_text/", "json/"
    )
    os.makedirs(output_data_json_folder, exist_ok=True)
    json_file = os.path.join(output_data_json_folder, f"{document_id}.json")

    data_from_gpt = None
    if os.path.exists(json_file):
        logger.info(
            f"The document: {document_id} has been parsed, loading data from {json_file}"
        )
        with open(json_file, "r", encoding="utf-8") as f:
            data_from_gpt = json.load(f)
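
        # A minimal sketch of one cached record, assuming the shape implied by
        # the loop below: "page_index" is an int and "raw_answer" is a
        # JSON-encoded string returned by the GPT call. Field names inside the
        # answer are hypothetical, for illustration only:
        #   {"page_index": 451, "raw_answer": "{\"funds\": [...]}"}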
        for extract_data in data_from_gpt:
            page_index = extract_data["page_index"]
            # Spot-check a single hand-picked page of this document.
            if page_index == 451:
                logger.info(f"Page index: {page_index}")
                raw_answer = extract_data["raw_answer"]
                raw_answer_json = json.loads(raw_answer)
                extract_data_info = data_extraction.validate_data(raw_answer_json)
                print(extract_data_info)


def get_datapoint_page_info(filter_pages) -> tuple:
    """Run the page-filter job; returns (datapoint_page_info, result_details)."""
    datapoint_page_info, result_details = filter_pages.start_job()
    return datapoint_page_info, result_details


def get_datapoints_from_datapoint_page_info(datapoint_page_info) -> list:
    """Return the datapoint names, excluding the bookkeeping "doc_id" key."""
    datapoints = list(datapoint_page_info.keys())
    if "doc_id" in datapoints:
        datapoints.remove("doc_id")
    return datapoints
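
# A minimal sketch of the `datapoint_page_info` shape assumed by the two
# helpers above: a dict keyed by datapoint name (plus a "doc_id" entry),
# presumably mapping each datapoint to the pages where it was found. The
# datapoint names and page numbers below are hypothetical, for illustration.
#
#   datapoint_page_info = {
#       "doc_id": "481482392",
#       "ter": [12, 451],
#       "performance_fee": [451],
#   }
#   get_datapoints_from_datapoint_page_info(datapoint_page_info)
#   # -> ["ter", "performance_fee"]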


if __name__ == "__main__":
    test_validate_extraction_data()
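
# Usage sketch (assumed invocation; requires the project root on PYTHONPATH,
# database access for query_document_fund_mapping, and the /data/emea_ar tree):
#   python path/to/this_test.py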