diff --git a/app_aus_prospectus.py b/app_aus_prospectus.py
new file mode 100644
index 0000000..90f3443
--- /dev/null
+++ b/app_aus_prospectus.py
@@ -0,0 +1,103 @@
+from flask import Flask, request, jsonify, render_template
+from flasgger import Swagger, swag_from
+from main import EMEA_AR_Parsing
+from utils.logger import logger
+from utils.biz_utils import clean_folder
+from tqdm import tqdm
+import pandas as pd
+import os
+
+
+template = {
+    "info": {
+        "title": "Australia Prospectus Data Extraction API",
+        "description": 'Australia Prospectus Data Extraction API',
+        "version": "1.0"
+    }
+}
+app = Flask(__name__)
+# By Swagger, we can see the API documentation in the browser, the example URL is http://127.0.0.1:8080/apidocs/
+swagger = Swagger(app, template=template)
+
+
+@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
+@swag_from('yml/aus_prospectus.yml')
+def aus_prospectus_data_extract():
+    """
+    Extract Australia Prospectus data from Australia Prospectus PDF document
+    input sample:
+    {
+        "doc_id": "412778803"
+    }
+    output: Australia Prospectus cost data as a list of dictionaries
+    :return: JSON with key "extract_data"; if extraction raises, "extract_data" and
+        "annotation_data" are empty lists and "error" holds the exception text.
+        A request without a usable doc_id gets HTTP 400 and an "error" message.
+    :rtype: flask.Response, or a (flask.Response, 400) tuple
+    """
+    logger.info('Australia Prospectus data extraction begin')
+    # silent=True makes get_json() return None when the body is missing or is not
+    # valid JSON, so a bad body gets the same 400 as a missing doc_id.
+    payload = request.get_json(silent=True)
+    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None
+
+    if not doc_id:
+        return jsonify({"error": "doc_id is required"}), 400
+
+    pdf_folder = r"./data/aus_prospectus/pdf/"
+    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
+    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
+    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
+    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
+    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
+    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
+    extract_way = "text"
+
+    work_folders = (
+        pdf_folder,
+        output_pdf_text_folder,
+        output_extract_data_folder,
+        output_mapping_data_folder,
+        drilldown_folder,
+        db_mapping_document_folder,
+        db_mapping_provider_folder,
+    )
+    # Same order as before: create every folder first, then clean each one.
+    for folder in work_folders:
+        os.makedirs(folder, exist_ok=True)
+    for folder in work_folders:
+        clean_folder(folder)
+
+    re_run_extract_data = False
+    re_run_mapping_data = False
+
+    try:
+        emea_ar_parsing = EMEA_AR_Parsing(doc_id=doc_id,
+                                          doc_source="aus_prospectus",
+                                          pdf_folder=pdf_folder,
+                                          output_pdf_text_folder=output_pdf_text_folder,
+                                          output_extract_data_folder=output_extract_data_folder,
+                                          output_mapping_data_folder=output_mapping_data_folder,
+                                          extract_way=extract_way,
+                                          drilldown_folder=drilldown_folder,
+                                          compare_with_provider=False)
+        # The second return value (the annotation list) is not used by this endpoint.
+        doc_data_from_gpt, _ = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
+        doc_mapping_data = emea_ar_parsing.mapping_data(
+            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
+        )
+        results = {"extract_data": doc_mapping_data}
+        return jsonify(results)
+    except Exception as e:
+        # NOTE(review): failures are returned with the default HTTP 200 and an
+        # "error" field; confirm clients expect that before moving to a 5xx status.
+        logger.error(f"Error: {e}")
+        results = {"extract_data": [],
+                   "annotation_data": [],
+                   "error": str(e)}
+        return jsonify(results)
+
+
+if __name__ == '__main__':
+    # Add use_reloader = False to avoid init twice
+    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
diff --git a/calc_metrics.py b/calc_metrics.py
index 840ee54..783f163 100644
--- a/calc_metrics.py
+++ b/calc_metrics.py
@@ -601,7 +601,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
     # ravi_verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
     # verify_file_path = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx"
     verify_fields = [
- "DocumentId", + "doc_id", "raw_fund_name", "fund_id", "fund_name", @@ -629,7 +629,7 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros # verify_data_df = raw_verify_data_df[raw_verify_data_df["sec_id"].isin(ravi_verify_data_df["sec_id"])] verify_data_df = verify_data_df[verify_fields] verify_data_df = verify_data_df.drop_duplicates() - verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) + # verify_data_df = verify_data_df.rename(columns={"DocumentId": "doc_id"}) verify_data_df.fillna("", inplace=True) verify_data_df.reset_index(drop=True, inplace=True) diff --git a/configuration/aus_prospectus/datapoint_keyword.json b/configuration/aus_prospectus/datapoint_keyword.json index 9026586..b6dc778 100644 --- a/configuration/aus_prospectus/datapoint_keyword.json +++ b/configuration/aus_prospectus/datapoint_keyword.json @@ -11,5 +11,5 @@ "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, - "change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} + "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_level.json b/configuration/aus_prospectus/datapoint_level.json index 036e792..fdb1208 100644 --- a/configuration/aus_prospectus/datapoint_level.json +++ b/configuration/aus_prospectus/datapoint_level.json @@ -11,5 +11,5 @@ 
"minimum_initial_investment": "fund_level", "indirect_costs": "share_level", "recoverable_expenses": "share_level", - "change_recoverable_expanses": "share_level" + "change_recoverable_expenses": "share_level" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_name.json b/configuration/aus_prospectus/datapoint_name.json index 9b3272f..893c4c7 100644 --- a/configuration/aus_prospectus/datapoint_name.json +++ b/configuration/aus_prospectus/datapoint_name.json @@ -11,5 +11,5 @@ "benchmark_name": "benchmark name", "indirect_costs": "indirect cost", "recoverable_expenses": "recoverable expenses", - "change_recoverable_expanses": "change recoverable expanses" + "change_recoverable_expenses": "change recoverable expanses" } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_reported_name.json b/configuration/aus_prospectus/datapoint_reported_name.json index c0906c0..7d913ef 100644 --- a/configuration/aus_prospectus/datapoint_reported_name.json +++ b/configuration/aus_prospectus/datapoint_reported_name.json @@ -11,5 +11,5 @@ "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}, "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]}, - "change_recoverable_expanses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} + "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]} } \ No newline at end of file diff --git a/configuration/aus_prospectus/datapoint_type.json 
b/configuration/aus_prospectus/datapoint_type.json index d1ed4a1..f38a099 100644 --- a/configuration/aus_prospectus/datapoint_type.json +++ b/configuration/aus_prospectus/datapoint_type.json @@ -11,5 +11,5 @@ "minimum_initial_investment": "integer", "indirect_costs": "float", "recoverable_expenses": "float", - "change_recoverable_expanses": "float" + "change_recoverable_expenses": "float" } \ No newline at end of file diff --git a/configuration/aus_prospectus/domicile_datapoints.json b/configuration/aus_prospectus/domicile_datapoints.json index c4ff806..2310ce4 100644 --- a/configuration/aus_prospectus/domicile_datapoints.json +++ b/configuration/aus_prospectus/domicile_datapoints.json @@ -32,7 +32,7 @@ "benchmark_name", "minimum_initial_investment", "indirect_costs", - "change_recoverable_expanses", + "change_recoverable_expenses", "recoverable_expenses" ] } diff --git a/core/data_mapping.py b/core/data_mapping.py index e50798e..c21c47e 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -308,7 +308,8 @@ class DataMapping: break if not exist: data = { - "DocumentId": doc_id, + "doc_id": doc_id, + "effective_date": doc_date, "raw_fund_name": raw_fund_name, "raw_share_name": raw_share_name, "raw_name": raw_name, @@ -316,9 +317,7 @@ class DataMapping: "fund_name": fund_legal_name, "sec_id": share_class_id, "sec_name": share_class_legal_name, - "EffectiveDate": doc_date, "page_index": [], - "RawName": raw_name, } for datapoint_name in datapoint_name_list: data[datapoint_name] = "" @@ -375,7 +374,8 @@ class DataMapping: exist = True if not exist: data = { - "DocumentId": doc_id, + "doc_id": doc_id, + "effective_date": doc_date, "raw_fund_name": raw_fund_name, "raw_share_name": "", "raw_name": raw_name, @@ -383,9 +383,7 @@ class DataMapping: "fund_name": fund_legal_name, "sec_id": "", "sec_name": "", - "EffectiveDate": doc_date, "page_index": [page_index], - "RawName": raw_name, } for datapoint_name in datapoint_name_list: data[datapoint_name] = "" diff 
--git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 0063c89..f6a9c2e 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -118,7 +118,7 @@ "minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00", "indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.", "recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.", - "change_recoverable_expanses": "Change recoverable expanses is belong to percentage number, the value should be less than 100." + "change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100." }, "special_rule": { "management_fee_and_costs": [ @@ -177,7 +177,7 @@ "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", "---Example End---", "The output should be:", - "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expanses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", + "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, 
\"change_recoverable_expenses\": 0.01, \"performance_fee\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "\n", "D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "---Example 1 Start---", @@ -617,7 +617,7 @@ "high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"], "indirect_costs_value": [0.12, 0.16, 0.02], "recoverable_expenses_value": [0.01, 0.05, 0.06], - "change_recoverable_expanses_value": [0.01, 0.02, 0.03] + "change_recoverable_expenses_value": [0.01, 0.02, 0.03] }, "dp_reported_name" : { "total_annual_dollar_based_charges": "Total annual dollar based charges", @@ -632,7 +632,7 @@ "minimum_initial_investment": "Minimum initial investment", "indirect_costs": "Indirect cost", "recoverable_expenses": "Recoverable expenses", - "change_recoverable_expanses": "Change recoverable expanses", + "change_recoverable_expenses": "Change recoverable expenses", "establishment_fee": "Establishment fee", "contribution_fee": "Contribution fee", "withdrawal_fee": "Withdrawal fee", diff --git a/yml/aus_prospectus.yml b/yml/aus_prospectus.yml new file mode 100644 index 0000000..ca80925 --- /dev/null +++ b/yml/aus_prospectus.yml @@ -0,0 +1,27 @@ +Example to extract data from Australia Prospectus PDF Document. +Sample: + { + "doc_id": "412778803" + } +Author: Blade He +--- +parameters: + - name: Australia Prospectus Document Id + in: body + type: string + required: true + description: Example to extract data from Australia Prospectus PDF Document. + default: {"doc_id": "412778803"} + schema: + required: + - Document Id + properties: + doc_id: + description: Australia Prospectus Document Id + required: true + type: string +responses: + 200: + description: successfully. + 400: + description: failed. \ No newline at end of file