Add API code file

This commit is contained in:
Blade He 2025-03-10 16:00:17 -05:00
parent e9f6383258
commit b7506c78f3
11 changed files with 139 additions and 18 deletions

96
app_aus_prospectus.py Normal file
View File

@ -0,0 +1,96 @@
from flask import Flask, request, jsonify, render_template
from flasgger import Swagger, swag_from
from main import EMEA_AR_Parsing
from utils.logger import logger
from utils.biz_utils import clean_folder
from tqdm import tqdm
import pandas as pd
import os
# Top-level metadata that flasgger merges into the generated Swagger document.
template = dict(
    info=dict(
        title="Australia Prospectus Data Extraction API",
        description="Australia Prospectus Data Extraction API",
        version="1.0",
    ),
)

app = Flask(__name__)

# Swagger publishes the interactive API documentation in the browser,
# e.g. http://127.0.0.1:8080/apidocs/
swagger = Swagger(app, template=template)
@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
@swag_from('yml/aus_prospectus.yml')
def aus_prospectus_data_extract():
    """
    Extract Australia Prospectus data from an Australia Prospectus PDF document.

    Expects a JSON request body, for example::

        {
            "doc_id": "412778803"
        }

    :return: On success, a JSON object ``{"extract_data": <mapping data>}``.
        When ``doc_id`` is missing, empty, or the body is not a JSON object,
        a JSON object ``{"error": "doc_id is required"}`` with HTTP status 400.
        When extraction or mapping raises, a JSON object with empty
        ``extract_data`` / ``annotation_data`` lists and the error message.
    :rtype: flask.Response or tuple[flask.Response, int]
    """
    logger.info('Australia Prospectus data extraction begin')

    # silent=True makes a missing, malformed or non-JSON body come back as None
    # instead of raising, so such requests get the documented JSON 400 response
    # rather than a framework error page or an AttributeError.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None
    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/aus_prospectus/pdf/"
    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
    extract_way = "text"

    # The db_mapping folders are prepared here although they are not passed to
    # EMEA_AR_Parsing below.
    # NOTE(review): presumably the pipeline reads them via defaults - confirm.
    work_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )
    # Create every folder first, then clean them, in the same order as before.
    for folder in work_folders:
        os.makedirs(folder, exist_ok=True)
    # NOTE(review): the folders are shared by all requests, so cleaning them per
    # request looks unsafe if two requests are served concurrently - confirm the
    # deployment handles one request at a time.
    for folder in work_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="aus_prospectus",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        # The annotation list returned by extract_data is not part of the
        # success response, so it is deliberately discarded.
        doc_data_from_gpt, _ = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        results = {"extract_data": doc_mapping_data}
        return jsonify(results)
    except Exception as e:
        # Top-level boundary of the endpoint: any pipeline failure is logged and
        # reported to the caller as a JSON payload. The default HTTP status
        # (200) is kept so existing clients that inspect the "error" key keep
        # working.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results)
if __name__ == '__main__':
    # use_reloader=False avoids initialising the application twice.
    # The port is passed as an int, which is the type Flask.run documents.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)

View File

@ -601,7 +601,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
# ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" # ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
# verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx" # verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
verify_fields = [ verify_fields = [
"DocumentId", "doc_id",
"raw_fund_name", "raw_fund_name",
"fund_id", "fund_id",
"fund_name", "fund_name",
@ -629,7 +629,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
# verify_data_df = raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])] # verify_data_df = raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])]
verify_data_df = verify_data_df[verify_fields] verify_data_df = verify_data_df[verify_fields]
verify_data_df = verify_data_df.drop_duplicates() verify_data_df = verify_data_df.drop_duplicates()
verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) # verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"})
verify_data_df.fillna("", inplace=True) verify_data_df.fillna("", inplace=True)
verify_data_df.reset_index(drop=True, inplace=True) verify_data_df.reset_index(drop=True, inplace=True)

View File

@ -11,5 +11,5 @@
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
} }

View File

@ -11,5 +11,5 @@
"minimum_initial_investment": "fund_level", "minimum_initial_investment": "fund_level",
"indirect_costs": "share_level", "indirect_costs": "share_level",
"recoverable_expenses": "share_level", "recoverable_expenses": "share_level",
"change_recoverable_expanses": "share_level" "change_recoverable_expenses": "share_level"
} }

View File

@ -11,5 +11,5 @@
"benchmark_name": "benchmark name", "benchmark_name": "benchmark name",
"indirect_costs": "indirect cost", "indirect_costs": "indirect cost",
"recoverable_expenses": "recoverable expenses", "recoverable_expenses": "recoverable expenses",
"change_recoverable_expanses": "change recoverable expanses" "change_recoverable_expenses": "change recoverable expanses"
} }

View File

@ -11,5 +11,5 @@
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
"recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
"change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
} }

View File

@ -11,5 +11,5 @@
"minimum_initial_investment": "integer", "minimum_initial_investment": "integer",
"indirect_costs": "float", "indirect_costs": "float",
"recoverable_expenses": "float", "recoverable_expenses": "float",
"change_recoverable_expanses": "float" "change_recoverable_expenses": "float"
} }

View File

@ -32,7 +32,7 @@
"benchmark_name", "benchmark_name",
"minimum_initial_investment", "minimum_initial_investment",
"indirect_costs", "indirect_costs",
"change_recoverable_expanses", "change_recoverable_expenses",
"recoverable_expenses" "recoverable_expenses"
] ]
} }

View File

@ -308,7 +308,8 @@ class DataMapping:
break break
if not exist: if not exist:
data = { data = {
"DocumentId": doc_id, "doc_id": doc_id,
"effective_date": doc_date,
"raw_fund_name": raw_fund_name, "raw_fund_name": raw_fund_name,
"raw_share_name": raw_share_name, "raw_share_name": raw_share_name,
"raw_name": raw_name, "raw_name": raw_name,
@ -316,9 +317,7 @@ class DataMapping:
"fund_name": fund_legal_name, "fund_name": fund_legal_name,
"sec_id": share_class_id, "sec_id": share_class_id,
"sec_name": share_class_legal_name, "sec_name": share_class_legal_name,
"EffectiveDate": doc_date,
"page_index": [], "page_index": [],
"RawName": raw_name,
} }
for datapoint_name in datapoint_name_list: for datapoint_name in datapoint_name_list:
data[datapoint_name] = "" data[datapoint_name] = ""
@ -375,7 +374,8 @@ class DataMapping:
exist = True exist = True
if not exist: if not exist:
data = { data = {
"DocumentId": doc_id, "doc_id": doc_id,
"effective_date": doc_date,
"raw_fund_name": raw_fund_name, "raw_fund_name": raw_fund_name,
"raw_share_name": "", "raw_share_name": "",
"raw_name": raw_name, "raw_name": raw_name,
@ -383,9 +383,7 @@ class DataMapping:
"fund_name": fund_legal_name, "fund_name": fund_legal_name,
"sec_id": "", "sec_id": "",
"sec_name": "", "sec_name": "",
"EffectiveDate": doc_date,
"page_index": [page_index], "page_index": [page_index],
"RawName": raw_name,
} }
for datapoint_name in datapoint_name_list: for datapoint_name in datapoint_name_list:
data[datapoint_name] = "" data[datapoint_name] = ""

View File

@ -118,7 +118,7 @@
"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00", "minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.", "indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.", "recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
"change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100." "change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100."
}, },
"special_rule": { "special_rule": {
"management_fee_and_costs": [ "management_fee_and_costs": [
@ -177,7 +177,7 @@
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n",
"---Example End---", "---Example End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
"\n", "\n",
"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
@ -617,7 +617,7 @@
"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
"indirect_costs_value": [0.12, 0.16, 0.02], "indirect_costs_value": [0.12, 0.16, 0.02],
"recoverable_expenses_value": [0.01, 0.05, 0.06], "recoverable_expenses_value": [0.01, 0.05, 0.06],
"change_recoverable_expanses_value": [0.01, 0.02, 0.03] "change_recoverable_expenses_value": [0.01, 0.02, 0.03]
}, },
"dp_reported_name" : { "dp_reported_name" : {
"total_annual_dollar_based_charges": "Total annual dollar based charges", "total_annual_dollar_based_charges": "Total annual dollar based charges",
@ -632,7 +632,7 @@
"minimum_initial_investment": "Minimum initial investment", "minimum_initial_investment": "Minimum initial investment",
"indirect_costs": "Indirect cost", "indirect_costs": "Indirect cost",
"recoverable_expenses": "Recoverable expenses", "recoverable_expenses": "Recoverable expenses",
"change_recoverable_expanses": "Change recoverable expanses", "change_recoverable_expenses": "Change recoverable expenses",
"establishment_fee": "Establishment fee", "establishment_fee": "Establishment fee",
"contribution_fee": "Contribution fee", "contribution_fee": "Contribution fee",
"withdrawal_fee": "Withdrawal fee", "withdrawal_fee": "Withdrawal fee",

27
yml/aus_prospectus.yml Normal file
View File

@ -0,0 +1,27 @@
Extract data from an Australia Prospectus PDF document.
Sample:
{
"doc_id": "412778803"
}
Author: Blade He
---
parameters:
  - name: Australia Prospectus Document Id
    in: body
    type: string
    required: true
    description: JSON body holding the id of the Australia Prospectus PDF document to extract data from.
    default: {"doc_id": "412778803"}
    schema:
      # "required" must list property names declared under "properties".
      required:
        - doc_id
      properties:
        doc_id:
          description: Australia Prospectus Document Id
          type: string
responses:
  200:
    description: Extraction finished; the extracted data is returned as JSON.
  400:
    description: Request rejected because doc_id is missing.