import os

from flask import Flask, request, jsonify, render_template
from flasgger import Swagger, swag_from
from tqdm import tqdm
import pandas as pd

from main import EMEA_AR_Parsing
from utils.logger import logger
from utils.biz_utils import clean_folder

# Top-level metadata merged into the Swagger specification served by flasgger.
template = {
    "info": {
        "title": "EMEA AR Data Extraction API",
        "description": 'EMEA AR Data Extraction API',
        "version": "1.0"
    }
}

app = Flask(__name__)
# Swagger serves the interactive API documentation in the browser,
# e.g. http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)


@app.route('/automation/api/model/emea_ar', methods=['POST'])
@swag_from('yml/emea_ar.yml')
def us_ar_data_extract():
    """Extract EMEA AR cost data from an EMEA LUX PDF document.

    Expects a JSON object in the request body, for example::

        {"doc_id": "501380553"}

    Returns:
        On success, a JSON object with two keys: ``extract_data`` (the result
        of ``EMEA_AR_Parsing.mapping_data``) and ``annotation_data`` (the
        annotation list returned by ``EMEA_AR_Parsing.extract_data``).
        If the body is not a JSON object or ``doc_id`` is missing/empty,
        ``{"error": "doc_id is required"}`` with HTTP status 400.
        If extraction or mapping raises, the same two keys with empty lists
        plus an ``error`` key holding the exception message.
    """
    logger.info('EMEA AR data extraction begin')

    # silent=True yields None instead of raising when the body is missing,
    # malformed, or not sent as JSON; the isinstance check also rejects valid
    # JSON that is not an object (e.g. a list), which has no .get().
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None
    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/emea_ar/pdf/"
    output_extract_data_folder = r"./data/emea_ar/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/emea_ar/output/mapping_data/docs/"
    drilldown_folder = r"./data/emea_ar/output/drilldown/"
    extract_way = "text"

    working_folders = (
        pdf_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
    )
    for folder in working_folders:
        os.makedirs(folder, exist_ok=True)
    # NOTE(review): clean_folder presumably empties each folder so a request
    # starts from a clean state - confirm in utils.biz_utils. The folders are
    # shared by every request, so concurrent requests may interfere.
    for folder in working_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            pdf_folder=pdf_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
        )
        doc_data_from_gpt, annotation_list = emea_ar_parsing.extract_data(
            re_run=re_run_extract_data
        )
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt,
            re_run=re_run_mapping_data,
        )
        results = {
            "extract_data": doc_mapping_data,
            "annotation_data": annotation_list,
        }
        return jsonify(results)
    except Exception as e:
        # Top-level boundary of the endpoint: report the failure in the
        # response body instead of letting the request crash.
        # NOTE(review): this path still answers with HTTP 200; kept as is
        # because existing clients may rely on it - consider a 5xx status.
        logger.error(f"Error: {e}")
        results = {"extract_data": [], "annotation_data": [], "error": str(e)}
        return jsonify(results)


if __name__ == '__main__':
    # use_reloader=False avoids initialising the application twice.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)