dc-ml-emea-ar/app_emea_ar.py

98 lines
3.7 KiB
Python
Raw Normal View History

from flask import Flask, request, jsonify, render_template
from flasgger import Swagger, swag_from
from main import EMEA_AR_Parsing
from utils.logger import logger
from utils.biz_utils import clean_folder
from tqdm import tqdm
import pandas as pd
import os
# "info" section handed to Swagger below as the API documentation template.
template = {
    "info": dict(
        title="EMEA AR Data Extraction API",
        description='EMEA AR Data Extraction API',
        version="1.0",
    ),
}

app = Flask(__name__)

# Swagger exposes the API documentation in the browser,
# for example at http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)
@app.route('/automation/api/model/emea_ar', methods=['POST'])
@swag_from('yml/emea_ar.yml')
def emea_ar_data_extract():
    """
    Extract EMEA AR cost data from an EMEA LUX PDF document.

    input sample:
    {
        "doc_id": "501380553"
    }
    output: JSON object with the keys
        "extract_data": result of ``EMEA_AR_Parsing.mapping_data``
        "annotation_data": annotation list returned by ``EMEA_AR_Parsing.extract_data``
        "error": exception text, present only when processing failed
                 (both data keys are then empty lists)

    :return: JSON response; ``(response, 400)`` when ``doc_id`` is missing,
             empty, or the request body is not a JSON object
    :rtype: flask.Response or tuple
    """
    logger.info('EMEA AR data extraction begin')

    # silent=True returns None for a missing/invalid JSON body instead of
    # raising, so every malformed request gets the same 400 answer below.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None
    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/emea_ar/pdf/"
    output_pdf_text_folder = r"./data/emea_ar/output/pdf_text/"
    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
    drilldown_folder = r"./data/emea_ar/output/drilldown/"
    db_mapping_document_folder = r"./data/emea_ar/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/emea_ar/output/db_mapping/provider/"
    extract_way = "text"

    working_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )
    # Make sure every working folder exists, then clean each one so the
    # request starts from a fresh state.
    # NOTE(review): these folders are shared by all requests; concurrent
    # calls may interfere with each other - confirm the deployment model.
    for folder in working_folders:
        os.makedirs(folder, exist_ok=True)
    for folder in working_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="emea_ar",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
            re_run=re_run_extract_data
        )
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        results = {"extract_data": doc_mapping_data,
                   "annotation_data": annotation_list}
        return jsonify(results)
    except Exception as e:
        # Top-level boundary of the endpoint: log and report the failure in
        # the payload.
        # NOTE(review): the failure is returned with the default 200 status;
        # kept unchanged because existing clients may rely on it.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results)
if __name__ == '__main__':
    # use_reloader=False avoids initialising the application twice.
    # The port is given as an int, which is the type Flask documents for it.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)