From a09778d9d1b84e465aa75ee22f44963fb457dbeb Mon Sep 17 00:00:00 2001
From: Blade He
Date: Tue, 26 Nov 2024 11:24:29 -0600
Subject: [PATCH] Create EMEA AR API code file. Optimize annotation list for drilldown.

---
 .gitignore         |  1 +
 app_emea_ar.py     | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 main.py            | 47 ++++++++++++++++++------------
 utils/biz_utils.py | 20 +++++++++++-
 utils/pdf_util.py  |  5 ++-
 yml/emea_ar.yml    | 27 +++++++++++++++++
 6 files changed, 156 insertions(+), 22 deletions(-)
 create mode 100644 app_emea_ar.py
 create mode 100644 yml/emea_ar.yml

diff --git a/.gitignore b/.gitignore
index 972b63d..d1f36cc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
 /core/__pycache__/*.pyc
 /test_calc_metrics.py
 /test_metrics
+/data
diff --git a/app_emea_ar.py b/app_emea_ar.py
new file mode 100644
index 0000000..f51929d
--- /dev/null
+++ b/app_emea_ar.py
@@ -0,0 +1,78 @@
+from flask import Flask, request, jsonify, render_template
+from flasgger import Swagger, swag_from
+from main import EMEA_AR_Parsing
+from utils.logger import logger
+from utils.biz_utils import clean_folder
+from tqdm import tqdm
+import pandas as pd
+import os
+
+
+template = {
+    "info": {
+        "title": "EMEA AR Data Extraction API",
+        "description": "EMEA AR Data Extraction API",
+        "version": "1.0"
+    }
+}
+app = Flask(__name__)
+# Swagger serves interactive API documentation in the browser, e.g. http://127.0.0.1:8080/apidocs/
+swagger = Swagger(app, template=template)
+
+
+@app.route('/automation/api/model/emea_ar', methods=['POST'])
+@swag_from('yml/emea_ar.yml')
+def emea_ar_data_extract():
+    """
+    Extract EMEA AR cost data from an EMEA LUX PDF document.
+    Input sample:
+    {
+        "doc_id": "501380553"
+    }
+    Output: EMEA AR cost data as a list of dictionaries.
+    :return: JSON with "extract_data" and "annotation_data"
+    :rtype: flask.Response
+    """
+    logger.info('EMEA AR data extraction begin')
+    doc_id = request.json.get('doc_id')
+
+    if not doc_id:
+        return jsonify({"error": "doc_id is required"}), 400
+
+    pdf_folder = r"./data/emea_ar/pdf/"
+    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
+    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
+    drilldown_folder = r"./data/emea_ar/output/drilldown/"
+    extract_way = "text"
+
+    os.makedirs(pdf_folder, exist_ok=True)
+    os.makedirs(output_extract_data_folder, exist_ok=True)
+    os.makedirs(output_mapping_data_folder, exist_ok=True)
+    os.makedirs(drilldown_folder, exist_ok=True)
+
+    clean_folder(pdf_folder)
+    clean_folder(output_extract_data_folder)
+    clean_folder(output_mapping_data_folder)
+    clean_folder(drilldown_folder)
+
+    re_run_extract_data = False
+    re_run_mapping_data = False
+
+    emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                      pdf_folder=pdf_folder,
+                                      output_extract_data_folder=output_extract_data_folder,
+                                      output_mapping_data_folder=output_mapping_data_folder,
+                                      extract_way=extract_way,
+                                      drilldown_folder=drilldown_folder)
+    doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+    doc_mapping_data = emea_ar_parsing.mapping_data(
+        data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
+    )
+    results = {"extract_data": doc_mapping_data,
+               "annotation_data": annotation_list}
+    return jsonify(results)
+
+
+if __name__ == '__main__':
+    # use_reloader=False prevents the app from initializing twice under the debug reloader
+    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
diff --git a/main.py b/main.py
index fa8be69..8ee0a7a 100644
--- a/main.py
+++ b/main.py
@@ -131,8 +131,8 @@ class EMEA_AR_Parsing:
             data_from_gpt = {"data": []}
         # Drilldown data to relevant PDF document
-        self.drilldown_pdf_document(data_from_gpt)
-        return data_from_gpt
+        annotation_list = self.drilldown_pdf_document(data_from_gpt)
+        return data_from_gpt, annotation_list
 
     def drilldown_pdf_document(self, data_from_gpt: list) -> list:
         logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
@@ -180,25 +180,32 @@ class EMEA_AR_Parsing:
         drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                     output_pdf_folder=self.drilldown_folder)
+        annotation_list = []
         if len(drilldown_result) > 0:
-            logger.info(f"Drilldown PDF document for doc_id: {self.doc_id} successfully")
-            for drilldown_data in drilldown_result:
-                drilldown_data["doc_id"] = self.doc_id
+            logger.info(f"Drilldown of PDF document for doc_id: {doc_id} succeeded")
+            annotation_list = drilldown_result.get("annotation_list", [])
+            for annotation in annotation_list:
+                annotation["doc_id"] = doc_id
         if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
             drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
             os.makedirs(drilldown_data_folder, exist_ok=True)
-            drilldown_file = os.path.join(drilldown_data_folder, f"{self.doc_id}_drilldown.xlsx")
+            drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
             drilldown_source_df = pd.DataFrame(drilldown_data_list)
-            drilldown_result_df = pd.DataFrame(drilldown_result)
-            # set drilldown_result_df column order as doc_id, pdf_file, page_index,
-            # data_point, value, matching_val_area, normalized_bbox
-            drilldown_result_df = drilldown_result_df[["doc_id", "pdf_file", "page_index",
-                                                       "data_point", "value", "matching_val_area", "normalized_bbox"]]
-            with pd.ExcelWriter(drilldown_file) as writer:
-                drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
-                drilldown_result_df.to_excel(writer, index=False, sheet_name="drilldown_data")
-
+            annotation_list_df = pd.DataFrame(annotation_list)
+            # set annotation_list_df column order as doc_id, pdf_file, page_index,
+            # data_point, value, matching_val_area, normalized_bbox
+            annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
+                                                     "data_point", "value", "matching_val_area", "normalized_bbox"]]
+            logger.info(f"Writing drilldown data to {drilldown_file}")
+            try:
+                with pd.ExcelWriter(drilldown_file) as writer:
+                    drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
+                    annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
+            except Exception as e:
+                logger.error(f"Failed to write drilldown file {drilldown_file}: {e}")
+            annotation_list = annotation_list_df.to_dict(orient="records")
+        return annotation_list
 
     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
         if not re_run:
@@ -252,8 +259,8 @@ def extract_data(
         output_extract_data_folder=output_data_folder,
         extract_way=extract_way,
     )
-    data_from_gpt = emea_ar_parsing.extract_data(re_run)
-    return data_from_gpt
+    data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run)
+    return data_from_gpt, annotation_list
 
 
 def mapping_data(
@@ -273,11 +280,11 @@ def mapping_data(
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
    )
-    doc_data_from_gpt = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+    doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
     doc_mapping_data = emea_ar_parsing.mapping_data(
         data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
     )
-    return doc_data_from_gpt, doc_mapping_data
+    return doc_data_from_gpt, annotation_list, doc_mapping_data
 
 
 def batch_extract_data(
@@ -369,7 +376,7 @@ def batch_start_job(
     result_extract_data_list = []
     result_mapping_data_list = []
     for doc_id in tqdm(doc_list):
-        doc_data_from_gpt, doc_mapping_data_list = mapping_data(
+        doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
             doc_id=doc_id,
             pdf_folder=pdf_folder,
             output_extract_data_folder=output_extract_data_child_folder,
diff --git a/utils/biz_utils.py b/utils/biz_utils.py
index 9e75d6d..acd5d09 100644
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@@ -1,4 +1,6 @@
 import re
+import os
+import time
 from utils.logger import logger
 from copy import deepcopy
 from traceback import print_exc
@@ -919,4 +921,20 @@ def split_short_name_with_share_features(text: str):
         new_text = currency_text + ' ' + text + ' ' + feature_name
     new_text = re.sub(r'\s+', ' ', new_text).strip()
-    return new_text
\ No newline at end of file
+    return new_text
+
+
+def clean_folder(folder_path: str, expired_days: int = 5):
+    if not os.path.exists(folder_path):
+        return
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if os.path.exists(file_path):
+                file_time = os.path.getmtime(file_path)
+                current_time = time.time()
+                if (current_time - file_time) / (60 * 60 * 24) > expired_days:
+                    try:
+                        os.remove(file_path)
+                    except OSError as e:
+                        logger.warning(f"Could not remove {file_path}: {e}")
\ No newline at end of file
diff --git a/utils/pdf_util.py b/utils/pdf_util.py
index 698c133..6312d0b 100644
--- a/utils/pdf_util.py
+++ b/utils/pdf_util.py
@@ -390,7 +390,8 @@ class PDFUtil:
                            "page_index": page_index,
                            "data_point": data_point,
                            "value": highlight_value,
-                           "matching_val_area": []}
+                           "matching_val_area": [],
+                           "normalized_bbox": []}
         if highlight_value_search_text is not None:
             content = {
                 "data_point": data_point,
@@ -432,6 +433,8 @@ class PDFUtil:
                 annotation_data["matching_val_area"] = bbox_list
                 if len(bbox_list) > 0:
                     annotation_data["normalized_bbox"] = self.get_bbox_normalized(page, bbox_list)
+                else:
+                    annotation_data["normalized_bbox"] = []
         return annotation_data
 
     def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
diff --git a/yml/emea_ar.yml b/yml/emea_ar.yml
new file mode 100644
index 0000000..75ea573
--- /dev/null
+++ b/yml/emea_ar.yml
@@ -0,0 +1,27 @@
+Extract data from an EMEA AR PDF document.
+Sample:
+  {
+    "doc_id": "501380553"
+  }
+Author: Blade He
+---
+parameters:
+  - name: body
+    in: body
+    required: true
+    description: EMEA AR Document Id payload.
+    default: {"doc_id": "501380553"}
+    schema:
+      type: object
+      required:
+        - doc_id
+      properties:
+        doc_id:
+          description: EMEA AR Document Id
+          type: string
+          example: "501380553"
+responses:
+  200:
+    description: Extraction succeeded.
+  400:
+    description: Bad request, doc_id is required.
\ No newline at end of file
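
A quick way to exercise the new endpoint once the service is running. This is a minimal sketch, not part of the patch: the host, port, route, sample doc_id, and response keys all come from the diff above; the `requests` dependency and the timeout value are assumptions.

    import requests

    # POST the sample payload from yml/emea_ar.yml to the new EMEA AR route.
    resp = requests.post(
        "http://127.0.0.1:8080/automation/api/model/emea_ar",
        json={"doc_id": "501380553"},  # sample doc_id from the Swagger yml
        timeout=600,  # assumption: extracting a large PDF can take minutes
    )
    resp.raise_for_status()
    payload = resp.json()
    print(payload["extract_data"])     # mapped cost data
    print(payload["annotation_data"])  # drilldown annotations incl. normalized_bbox

Note that the handler returns HTTP 400 with {"error": "doc_id is required"} when the payload has no doc_id, so a client should check resp.status_code (or rely on raise_for_status as above) before reading the result keys.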