Create EMEA AR API code file.
Optimize annotation list for drilldown.
This commit is contained in:
parent fb356fce76
commit a09778d9d1
.gitignore
@@ -5,3 +5,4 @@
 /core/__pycache__/*.pyc
 /test_calc_metrics.py
 /test_metrics
+/data

@@ -0,0 +1,78 @@
+from flask import Flask, request, jsonify, render_template
+from flasgger import Swagger, swag_from
+from main import EMEA_AR_Parsing
+from utils.logger import logger
+from utils.biz_utils import clean_folder
+from tqdm import tqdm
+import pandas as pd
+import os
+
+
+template = {
+    "info": {
+        "title": "EMEA AR Data Extraction API",
+        "description": "EMEA AR Data Extraction API",
+        "version": "1.0"
+    }
+}
+app = Flask(__name__)
+# With Swagger, the API documentation can be viewed in the browser, e.g. http://127.0.0.1:8080/apidocs/
+swagger = Swagger(app, template=template)
+
+
+@app.route('/automation/api/model/emea_ar', methods=['POST'])
+@swag_from('yml/emea_ar.yml')
+def emea_ar_data_extract():
+    """
+    Extract EMEA AR cost data from an EMEA LUX PDF document.
+    input sample:
+    {
+        "doc_id": "501380553"
+    }
+    output: EMEA AR cost data as a list of dictionaries
+    :return:
+    :rtype:
+    """
+    logger.info('EMEA AR data extraction begin')
+    doc_id = request.json.get('doc_id')
+
+    if not doc_id:
+        return jsonify({"error": "doc_id is required"}), 400
+
+    pdf_folder = r"./data/emea_ar/pdf/"
+    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
+    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
+    drilldown_folder = r"./data/emea_ar/output/drilldown/"
+    extract_way = "text"
+
+    os.makedirs(pdf_folder, exist_ok=True)
+    os.makedirs(output_extract_data_folder, exist_ok=True)
+    os.makedirs(output_mapping_data_folder, exist_ok=True)
+    os.makedirs(drilldown_folder, exist_ok=True)
+
+    clean_folder(pdf_folder)
+    clean_folder(output_extract_data_folder)
+    clean_folder(output_mapping_data_folder)
+    clean_folder(drilldown_folder)
+
+    re_run_extract_data = False
+    re_run_mapping_data = False
+
+    emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                      pdf_folder=pdf_folder,
+                                      output_extract_data_folder=output_extract_data_folder,
+                                      output_mapping_data_folder=output_mapping_data_folder,
+                                      extract_way=extract_way,
+                                      drilldown_folder=drilldown_folder)
+    doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+    doc_mapping_data = emea_ar_parsing.mapping_data(
+        data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
+    )
+    results = {"extract_data": doc_mapping_data,
+               "annotation_data": annotation_list}
+    return jsonify(results)
+
+
+if __name__ == '__main__':
+    # Add use_reloader=False to avoid initializing the app twice
+    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
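
For reference, the new endpoint can be exercised like this (a minimal sketch, assuming the app above is running locally on port 8080; the response shape mirrors the results dict built in the handler):

    import requests

    # Hypothetical smoke test for the endpoint added above.
    resp = requests.post(
        "http://127.0.0.1:8080/automation/api/model/emea_ar",
        json={"doc_id": "501380553"},
    )
    resp.raise_for_status()
    payload = resp.json()
    print(payload["extract_data"])     # mapped EMEA AR cost data
    print(payload["annotation_data"])  # drilldown annotation records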

main.py
@@ -131,8 +131,8 @@ class EMEA_AR_Parsing:
             data_from_gpt = {"data": []}
 
         # Drilldown data to relevant PDF document
-        self.drilldown_pdf_document(data_from_gpt)
-        return data_from_gpt
+        annotation_list = self.drilldown_pdf_document(data_from_gpt)
+        return data_from_gpt, annotation_list
 
     def drilldown_pdf_document(self, data_from_gpt: list) -> list:
         logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
@@ -180,25 +180,32 @@ class EMEA_AR_Parsing:
 
         drilldown_result = pdf_util.batch_drilldown(drilldown_data_list=drilldown_data_list,
                                                     output_pdf_folder=self.drilldown_folder)
+        annotation_list = []
         if len(drilldown_result) > 0:
-            logger.info(f"Drilldown PDF document for doc_id: {self.doc_id} successfully")
-            for drilldown_data in drilldown_result:
-                drilldown_data["doc_id"] = self.doc_id
+            logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
+            annotation_list = drilldown_result.get("annotation_list", [])
+            for annotation in annotation_list:
+                annotation["doc_id"] = doc_id
             if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
                 drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
                 os.makedirs(drilldown_data_folder, exist_ok=True)
-                drilldown_file = os.path.join(drilldown_data_folder, f"{self.doc_id}_drilldown.xlsx")
+                drilldown_file = os.path.join(drilldown_data_folder, f"{doc_id}_drilldown.xlsx")
 
                 drilldown_source_df = pd.DataFrame(drilldown_data_list)
-                drilldown_result_df = pd.DataFrame(drilldown_result)
+                annotation_list_df = pd.DataFrame(annotation_list)
                 # set drilldown_result_df column order as doc_id, pdf_file, page_index,
                 # data_point, value, matching_val_area, normalized_bbox
-                drilldown_result_df = drilldown_result_df[["doc_id", "pdf_file", "page_index",
+                annotation_list_df = annotation_list_df[["doc_id", "pdf_file", "page_index",
                                                          "data_point", "value", "matching_val_area", "normalized_bbox"]]
-                with pd.ExcelWriter(drilldown_file) as writer:
-                    drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
-                    drilldown_result_df.to_excel(writer, index=False, sheet_name="drilldown_data")
+                logger.info(f"Writing drilldown data to {drilldown_file}")
+                try:
+                    with pd.ExcelWriter(drilldown_file) as writer:
+                        drilldown_source_df.to_excel(writer, index=False, sheet_name="source_data")
+                        annotation_list_df.to_excel(writer, index=False, sheet_name="drilldown_data")
+                except Exception as e:
+                    logger.error(f"Error: {e}")
+                annotation_list = annotation_list_df.to_dict(orient="records")
+        return annotation_list
 
     def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
         if not re_run:
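
To sanity-check the workbook written by drilldown_pdf_document, the two sheets can be read back with pandas (a sketch; the path below assumes the folders configured in the API file and doc_id 501380553):

    import pandas as pd

    xlsx = "./data/emea_ar/output/drilldown/data/501380553_drilldown.xlsx"
    source_df = pd.read_excel(xlsx, sheet_name="source_data")         # batch_drilldown input
    annotation_df = pd.read_excel(xlsx, sheet_name="drilldown_data")  # annotation records
    print(annotation_df[["doc_id", "data_point", "value"]].head())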
@@ -252,8 +259,8 @@ def extract_data(
         output_extract_data_folder=output_data_folder,
         extract_way=extract_way,
     )
-    data_from_gpt = emea_ar_parsing.extract_data(re_run)
-    return data_from_gpt
+    data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run)
+    return data_from_gpt, annotation_list
 
 
 def mapping_data(
@@ -273,11 +280,11 @@ def mapping_data(
         output_mapping_data_folder=output_mapping_folder,
         extract_way=extract_way,
     )
-    doc_data_from_gpt = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+    doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
     doc_mapping_data = emea_ar_parsing.mapping_data(
         data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
     )
-    return doc_data_from_gpt, doc_mapping_data
+    return doc_data_from_gpt, annotation_list, doc_mapping_data
 
 
 def batch_extract_data(
@@ -369,7 +376,7 @@ def batch_start_job(
     result_extract_data_list = []
     result_mapping_data_list = []
     for doc_id in tqdm(doc_list):
-        doc_data_from_gpt, doc_mapping_data_list = mapping_data(
+        doc_data_from_gpt, annotation_list, doc_mapping_data_list = mapping_data(
             doc_id=doc_id,
             pdf_folder=pdf_folder,
             output_extract_data_folder=output_extract_data_child_folder,

utils/biz_utils.py
@@ -1,4 +1,6 @@
 import re
+import os
+import time
 from utils.logger import logger
 from copy import deepcopy
 from traceback import print_exc
@@ -920,3 +922,19 @@ def split_short_name_with_share_features(text: str):
     new_text = currency_text + ' ' + text + ' ' + feature_name
     new_text = re.sub(r'\s+', ' ', new_text).strip()
     return new_text
+
+
+def clean_folder(folder_path: str, expired_days: int = 5):
+    if not os.path.exists(folder_path):
+        return
+    for root, dirs, files in os.walk(folder_path):
+        for file in files:
+            file_path = os.path.join(root, file)
+            if os.path.exists(file_path):
+                file_time = os.path.getmtime(file_path)
+                current_time = time.time()
+                if (current_time - file_time) / (60 * 60 * 24) > expired_days:
+                    try:
+                        os.remove(file_path)
+                    except OSError:
+                        pass
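
clean_folder keeps the working directories from growing without bound: on each walk it deletes files whose modification time is older than expired_days. A usage sketch (paths taken from the API file above; the 2-day retention is an arbitrary example):

    from utils.biz_utils import clean_folder

    # Default retention: drop files older than 5 days.
    clean_folder("./data/emea_ar/pdf/")
    # Shorter retention for bulky drilldown output (illustrative).
    clean_folder("./data/emea_ar/output/drilldown/", expired_days=2)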

@@ -390,7 +390,8 @@ class PDFUtil:
                            "page_index": page_index,
                            "data_point": data_point,
                            "value": highlight_value,
-                           "matching_val_area": []}
+                           "matching_val_area": [],
+                           "normalized_bbox": []}
         if highlight_value_search_text is not None:
             content = {
                 "data_point": data_point,
@@ -432,6 +433,8 @@ class PDFUtil:
         annotation_data["matching_val_area"] = bbox_list
         if len(bbox_list) > 0:
             annotation_data["normalized_bbox"] = self.get_bbox_normalized(page, bbox_list)
+        else:
+            annotation_data["normalized_bbox"] = []
         return annotation_data
 
     def get_proper_search_text(self, raw_value: str, highlight_value_regex: str, page_text: str, ignore_case: bool = True):
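
With this change every annotation record carries a normalized_bbox key even when no match area was found, so downstream consumers can rely on the shape. Illustratively (all values below are made up, and it is assumed that get_bbox_normalized scales page coordinates into the 0-1 range):

    # Hypothetical annotation record as assembled by PDFUtil:
    annotation_data = {
        "doc_id": "501380553",
        "pdf_file": "501380553.pdf",
        "page_index": 12,
        "data_point": "ongoing_charge",                      # made-up data point name
        "value": "0.85%",
        "matching_val_area": [[72.0, 540.3, 118.4, 552.1]],  # page coordinates
        "normalized_bbox": [[0.12, 0.64, 0.20, 0.66]],       # assumed 0-1 scale
    }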

yml/emea_ar.yml
@@ -0,0 +1,27 @@
+Example to extract data from EMEA AR PDF Document.
+Sample:
+{
+    "doc_id": "501380553"
+}
+Author: Blade He
+---
+parameters:
+  - name: EMEA AR Document Id
+    in: body
+    type: string
+    required: true
+    description: Example to extract data from EMEA AR PDF Document.
+    default: {"doc_id": "501380553"}
+    schema:
+      required:
+        - Document Id
+      properties:
+        doc_id:
+          description: EMEA AR Document Id
+          required: true
+          type: string
+responses:
+  200:
+    description: successfully.
+  400:
+    description: failed.