Add API code file
This commit is contained in:
parent
e9f6383258
commit
b7506c78f3
|
|
@ -0,0 +1,96 @@
|
|||
from flask import Flask, request, jsonify, render_template
|
||||
from flasgger import Swagger, swag_from
|
||||
from main import EMEA_AR_Parsing
|
||||
from utils.logger import logger
|
||||
from utils.biz_utils import clean_folder
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import os
|
||||
|
||||
|
||||
# Swagger "info" metadata (title, description, version) for the generated
# API documentation; passed to flasgger's Swagger(...) further down the file.
template = dict(
    info=dict(
        title="Australia Prospectus Data Extraction API",
        description="Australia Prospectus Data Extraction API",
        version="1.0",
    ),
)
|
||||
# Flask application object that the route decorators in this module attach to.
app = Flask(__name__)

# Registering Swagger publishes browsable API documentation for the app;
# with the server started from this module the example URL is
# http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)
|
||||
|
||||
|
||||
@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
@swag_from('yml/aus_prospectus.yml')
def aus_prospectus_data_extract():
    """
    Extract Australia Prospectus data from an Australia Prospectus PDF document.

    Expects a JSON object in the request body, for example:
        {
            "doc_id": "412778803"
        }

    :return: On success, a JSON object whose "extract_data" entry is the value
        returned by ``EMEA_AR_Parsing.mapping_data`` (described by the original
        author as Australia Prospectus cost data, a list of dictionaries).
        HTTP 400 with an "error" message when the body is not a JSON object or
        carries no "doc_id". HTTP 500 with empty "extract_data" and
        "annotation_data" lists plus the error text when extraction or mapping
        raises.
    :rtype: flask.Response, or a (flask.Response, status code) tuple on errors
    """
    logger.info('Australia Prospectus data extraction begin')

    # silent=True returns None for a missing/invalid JSON body instead of
    # raising, and the isinstance guard covers JSON that is not an object
    # (e.g. a list), so every malformed request ends in the 400 below.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None

    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/aus_prospectus/pdf/"
    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
    extract_way = "text"

    working_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )

    # Create every working folder first, then clean them, in the same order
    # as before.
    for folder in working_folders:
        os.makedirs(folder, exist_ok=True)

    # NOTE(review): these paths are fixed and shared by every request, and are
    # cleaned on each call; presumably clean_folder empties them, so
    # concurrent requests could interfere with each other - confirm.
    for folder in working_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="aus_prospectus",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        # extract_data returns a pair; the second item (annotations) is not
        # part of the success response, so it is discarded here.
        doc_data_from_gpt, _ = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        return jsonify({"extract_data": doc_mapping_data})
    except Exception as e:
        # Top-level boundary for the endpoint: log the failure and report it
        # to the client with a server-error status rather than HTTP 200.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results), 500
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # use_reloader=False avoids initialising the application twice.
    # The port is passed as an integer, matching Flask.run's documented type.
    # host='0.0.0.0' listens on all network interfaces.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
|
||||
|
|
@ -601,7 +601,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
|||
# ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
|
||||
# verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
|
||||
verify_fields = [
|
||||
"DocumentId",
|
||||
"doc_id",
|
||||
"raw_fund_name",
|
||||
"fund_id",
|
||||
"fund_name",
|
||||
|
|
@ -629,7 +629,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
|
|||
# verify_data_df = raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])]
|
||||
verify_data_df = verify_data_df[verify_fields]
|
||||
verify_data_df = verify_data_df.drop_duplicates()
|
||||
verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"})
|
||||
# verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"})
|
||||
verify_data_df.fillna("", inplace=True)
|
||||
verify_data_df.reset_index(drop=True, inplace=True)
|
||||
|
||||
|
|
|
|||
|
|
@ -11,5 +11,5 @@
|
|||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
|
||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
||||
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||
"change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||
}
|
||||
|
|
@ -11,5 +11,5 @@
|
|||
"minimum_initial_investment": "fund_level",
|
||||
"indirect_costs": "share_level",
|
||||
"recoverable_expenses": "share_level",
|
||||
"change_recoverable_expanses": "share_level"
|
||||
"change_recoverable_expenses": "share_level"
|
||||
}
|
||||
|
|
@ -11,5 +11,5 @@
|
|||
"benchmark_name": "benchmark name",
|
||||
"indirect_costs": "indirect cost",
|
||||
"recoverable_expenses": "recoverable expenses",
|
||||
"change_recoverable_expanses": "change recoverable expanses"
|
||||
"change_recoverable_expenses": "change recoverable expanses"
|
||||
}
|
||||
|
|
@ -11,5 +11,5 @@
|
|||
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
|
||||
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
|
||||
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
|
||||
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||
"change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
|
||||
}
|
||||
|
|
@ -11,5 +11,5 @@
|
|||
"minimum_initial_investment": "integer",
|
||||
"indirect_costs": "float",
|
||||
"recoverable_expenses": "float",
|
||||
"change_recoverable_expanses": "float"
|
||||
"change_recoverable_expenses": "float"
|
||||
}
|
||||
|
|
@ -32,7 +32,7 @@
|
|||
"benchmark_name",
|
||||
"minimum_initial_investment",
|
||||
"indirect_costs",
|
||||
"change_recoverable_expanses",
|
||||
"change_recoverable_expenses",
|
||||
"recoverable_expenses"
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -308,7 +308,8 @@ class DataMapping:
|
|||
break
|
||||
if not exist:
|
||||
data = {
|
||||
"DocumentId": doc_id,
|
||||
"doc_id": doc_id,
|
||||
"effective_date": doc_date,
|
||||
"raw_fund_name": raw_fund_name,
|
||||
"raw_share_name": raw_share_name,
|
||||
"raw_name": raw_name,
|
||||
|
|
@ -316,9 +317,7 @@ class DataMapping:
|
|||
"fund_name": fund_legal_name,
|
||||
"sec_id": share_class_id,
|
||||
"sec_name": share_class_legal_name,
|
||||
"EffectiveDate": doc_date,
|
||||
"page_index": [],
|
||||
"RawName": raw_name,
|
||||
}
|
||||
for datapoint_name in datapoint_name_list:
|
||||
data[datapoint_name] = ""
|
||||
|
|
@ -375,7 +374,8 @@ class DataMapping:
|
|||
exist = True
|
||||
if not exist:
|
||||
data = {
|
||||
"DocumentId": doc_id,
|
||||
"doc_id": doc_id,
|
||||
"effective_date": doc_date,
|
||||
"raw_fund_name": raw_fund_name,
|
||||
"raw_share_name": "",
|
||||
"raw_name": raw_name,
|
||||
|
|
@ -383,9 +383,7 @@ class DataMapping:
|
|||
"fund_name": fund_legal_name,
|
||||
"sec_id": "",
|
||||
"sec_name": "",
|
||||
"EffectiveDate": doc_date,
|
||||
"page_index": [page_index],
|
||||
"RawName": raw_name,
|
||||
}
|
||||
for datapoint_name in datapoint_name_list:
|
||||
data[datapoint_name] = ""
|
||||
|
|
|
|||
|
|
@ -118,7 +118,7 @@
|
|||
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
|
||||
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
|
||||
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
|
||||
"change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100."
|
||||
"change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100."
|
||||
},
|
||||
"special_rule": {
|
||||
"management_fee_and_costs": [
|
||||
|
|
@ -177,7 +177,7 @@
|
|||
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
|
||||
"---Example End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
||||
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
|
||||
"\n",
|
||||
"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||
"---Example 1 Start---",
|
||||
|
|
@ -617,7 +617,7 @@
|
|||
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
|
||||
"indirect_costs_value": [0.12, 0.16, 0.02],
|
||||
"recoverable_expenses_value": [0.01, 0.05, 0.06],
|
||||
"change_recoverable_expanses_value": [0.01, 0.02, 0.03]
|
||||
"change_recoverable_expenses_value": [0.01, 0.02, 0.03]
|
||||
},
|
||||
"dp_reported_name" : {
|
||||
"total_annual_dollar_based_charges": "Total annual dollar based charges",
|
||||
|
|
@ -632,7 +632,7 @@
|
|||
"minimum_initial_investment": "Minimum initial investment",
|
||||
"indirect_costs": "Indirect cost",
|
||||
"recoverable_expenses": "Recoverable expenses",
|
||||
"change_recoverable_expanses": "Change recoverable expanses",
|
||||
"change_recoverable_expenses": "Change recoverable expenses",
|
||||
"establishment_fee": "Establishment fee",
|
||||
"contribution_fee": "Contribution fee",
|
||||
"withdrawal_fee": "Withdrawal fee",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,27 @@
|
|||
Example to extract data from Australia Prospectus PDF Document.
|
||||
Sample:
|
||||
{
|
||||
"doc_id": "412778803"
|
||||
}
|
||||
Author: Blade He
|
||||
---
|
||||
parameters:
|
||||
- name: Australia Prospectus Document Id
|
||||
in: body
|
||||
type: string
|
||||
required: true
|
||||
description: Example to extract data from Australia Prospectus PDF Document.
|
||||
default: {"doc_id": "412778803"}
|
||||
schema:
|
||||
required:
|
||||
- doc_id
|
||||
properties:
|
||||
doc_id:
|
||||
description: Australia Prospectus Document Id
|
||||
required: true
|
||||
type: string
|
||||
responses:
|
||||
200:
|
||||
description: successfully.
|
||||
400:
|
||||
description: failed.
|
||||
Loading…
Reference in New Issue