97 lines
3.8 KiB
Python
97 lines
3.8 KiB
Python
|
|
from flask import Flask, request, jsonify, render_template
|
||
|
|
from flasgger import Swagger, swag_from
|
||
|
|
from main import EMEA_AR_Parsing
|
||
|
|
from utils.logger import logger
|
||
|
|
from utils.biz_utils import clean_folder
|
||
|
|
from tqdm import tqdm
|
||
|
|
import pandas as pd
|
||
|
|
import os
|
||
|
|
|
||
|
|
|
||
|
|
# Base Swagger/OpenAPI metadata handed to flasgger; only the "info" section
# (title, description, version) is customised here.
template = {
    "info": {
        "title": "Australia Prospectus Data Extraction API",
        "description": "Australia Prospectus Data Extraction API",
        "version": "1.0",
    },
}
|
||
|
|
# The Flask application object that the route decorators below attach to.
app = Flask(__name__)

# Wrapping the app with Swagger publishes browsable API documentation;
# when the server runs locally on port 8080 the page is available at
# http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)
|
||
|
|
|
||
|
|
|
||
|
|
@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
@swag_from('yml/aus_prospectus.yml')
def aus_prospectus_data_extract():
    """
    Extract Australia Prospectus data from an Australia Prospectus PDF document.

    Input sample (JSON request body):
    {
        "doc_id": "412778803"
    }

    Every call creates the working folders under ./data/aus_prospectus/ if
    they are missing and passes each of them to clean_folder before the
    extraction starts, then runs EMEA_AR_Parsing.extract_data followed by
    EMEA_AR_Parsing.mapping_data for the requested document.

    output: the mapping result under the "extract_data" key (described by the
    original author as Australia Prospectus cost data, a list of dictionaries).

    :return: On success, JSON ``{"extract_data": <mapping data>}``.
        When ``doc_id`` is absent/empty or the body is not a JSON object,
        JSON ``{"error": "doc_id is required"}`` with HTTP status 400.
        When extraction or mapping raises, JSON
        ``{"extract_data": [], "annotation_data": [], "error": <message>}``.
    :rtype: flask.Response, or a (flask.Response, int) tuple for the 400 case
    """
    logger.info('Australia Prospectus data extraction begin')

    # silent=True makes get_json return None (instead of raising) when the
    # body is missing, has a non-JSON content type, or fails to parse. The
    # isinstance check also covers valid JSON that is not an object (e.g. a
    # list), so every malformed request ends in the explicit 400 below.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None

    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/aus_prospectus/pdf/"
    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
    extract_way = "text"

    # The two db_mapping folders are not passed to EMEA_AR_Parsing below, but
    # they are still created and cleaned exactly as before.
    working_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )

    # Create everything first, then clean, preserving the original ordering.
    for folder in working_folders:
        os.makedirs(folder, exist_ok=True)

    # NOTE(review): clean_folder presumably empties each folder; because the
    # paths are shared by all requests, concurrent calls could interfere with
    # each other -- confirm the deployment serves one request at a time.
    for folder in working_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="aus_prospectus",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        # extract_data returns a pair; the second element (annotations) is
        # not used by this endpoint.
        doc_data_from_gpt, _annotation_list = emea_ar_parsing.extract_data(
            re_run=re_run_extract_data
        )
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        results = {"extract_data": doc_mapping_data}
        return jsonify(results)
    except Exception as e:
        # Top-level request boundary: report the failure in the payload
        # rather than letting the exception propagate to Flask.
        # NOTE(review): this response is sent with the default HTTP 200
        # status; kept unchanged so existing clients that inspect the "error"
        # key keep working -- consider a 5xx status in a future API revision.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == '__main__':
    # Start the development server when this file is executed directly.
    # host='0.0.0.0' listens on all network interfaces.
    # port is given as an integer, the type Flask.run documents (the original
    # passed the string "8080").
    # use_reloader=False avoids initialising the application twice.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|