fix issue

Update to apply ALI QWEN as a demo
add mini_main.py
2025-11-12 14:07:55 +08:00 · 2025-11-11 13:33:57 +08:00 · 2025-11-10 16:55:55 +08:00 · 2025-04-03 18:08:27 -05:00 · 2025-04-03 17:06:43 -05:00 · 2025-04-02 20:39:31 -05:00
49 changed files with 6538 additions and 699 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,7 +3,6 @@
 /utils/__pycache__
 /__pycache__/*.pyc
 /core/__pycache__/*.pyc
 /test_calc_metrics.py
 /test_metrics
 /data
 /sample_documents/japan_prospectus.txt
@ -14,3 +13,7 @@
 /test_specific_biz_logic.py
 /drilldown_practice.py
 /core/auz_nz/__pycache__/*.pyc
 /performance.ipynb
 /sample_documents/special_cases.txt
 /aus-prospectus/
 /output/log/*.log
--- a/app_aus_prospectus.py
+++ b/app_aus_prospectus.py
@ -0,0 +1,96 @@
 from flask import Flask, request, jsonify, render_template  
 from flasgger import Swagger, swag_from
 from main import EMEA_AR_Parsing
 from utils.logger import logger
 from utils.biz_utils import clean_folder
 from tqdm import tqdm
 import pandas as pd
 import os
 # Top-level API metadata handed to flasgger's Swagger constructor below.
 template = {
    "info": dict(
        title="Australia Prospectus Data Extraction API",
        description='Australia Prospectus Data Extraction API',
        version="1.0",
    )
 }
 app = Flask(__name__)
 # Swagger exposes the API documentation in the browser,
 # for example at http://127.0.0.1:8080/apidocs/
 swagger = Swagger(app, template=template)
@app.route('/automation/api/model/aus_prospectus', methods=['POST'])
@swag_from('yml/aus_prospectus.yml')
 def aus_prospectus_data_extract():
    """
    Extract Australia Prospectus data from an Australia Prospectus PDF document.

    Input sample (JSON request body):
    {
        "doc_id": "412778803"
    }

    Output:
        On success, a JSON object {"extract_data": <result of mapping_data>}.
        If the body is missing, is not valid JSON, is not a JSON object, or
        has no "doc_id", a JSON error message with HTTP status 400.
        If extraction or mapping raises, a JSON object with empty
        "extract_data" and "annotation_data" lists plus an "error" message,
        returned with the default status code (existing contract, unchanged).
    """
    logger.info('Australia Prospectus data extraction begin')
    # silent=True turns a missing, non-JSON or malformed body into None
    # instead of raising, so every bad request reaches the 400 branch below.
    payload = request.get_json(silent=True)
    doc_id = payload.get('doc_id') if isinstance(payload, dict) else None
    if not doc_id:
        return jsonify({"error": "doc_id is required"}), 400

    pdf_folder = r"./data/aus_prospectus/pdf/"
    output_pdf_text_folder = r"./data/aus_prospectus/output/pdf_text/"
    output_extract_data_folder = r"./data/aus_prospectus/output/extract_data/docs/"
    output_mapping_data_folder = r"./data/aus_prospectus/output/mapping_data/docs/"
    drilldown_folder = r"./data/aus_prospectus/output/drilldown/"
    db_mapping_document_folder = r"./data/aus_prospectus/output/db_mapping/document/"
    db_mapping_provider_folder = r"./data/aus_prospectus/output/db_mapping/provider/"
    extract_way = "text"

    working_folders = (
        pdf_folder,
        output_pdf_text_folder,
        output_extract_data_folder,
        output_mapping_data_folder,
        drilldown_folder,
        db_mapping_document_folder,
        db_mapping_provider_folder,
    )
    # Create every folder first, then clean them, in the same order as before.
    for folder in working_folders:
        os.makedirs(folder, exist_ok=True)
    for folder in working_folders:
        clean_folder(folder)

    re_run_extract_data = False
    re_run_mapping_data = False

    try:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id=doc_id,
            doc_source="aus_prospectus",
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_folder,
            output_mapping_data_folder=output_mapping_data_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False,
        )
        # extract_data returns a pair; the second item (annotations) is not
        # part of the success response, so it is discarded here.
        doc_data_from_gpt, _ = emea_ar_parsing.extract_data(re_run=re_run_extract_data)
        doc_mapping_data = emea_ar_parsing.mapping_data(
            data_from_gpt=doc_data_from_gpt, re_run=re_run_mapping_data
        )
        results = {"extract_data": doc_mapping_data}
        return jsonify(results)
    except Exception as e:
        # Endpoint boundary: report the failure in the response body rather
        # than letting the exception propagate to the client.
        logger.error(f"Error: {e}")
        results = {"extract_data": [],
                   "annotation_data": [],
                   "error": str(e)}
        return jsonify(results)
 if __name__ == '__main__':
    # use_reloader=False avoids initializing the application twice.
    # The port is passed as an int, the type Flask.run documents for it.
    app.run(host='0.0.0.0', port=8080, debug=False, use_reloader=False)
--- a/calc_metrics.py
+++ b/calc_metrics.py
--- a/configuration/aus_prospectus/abbreviation_records.json
+++ b/configuration/aus_prospectus/abbreviation_records.json
@ -321,6 +321,7 @@
    "Edu": "Education",
    "Elevation Fds (IE)": "Elevation UCITS Funds (Ireland) ICAV",
    "E": "Elite",
    "EF": "Entry Fee",
    "Emgnt": "Emergente",
    "Em": "Emerging",
    "Emerg": "Emerging",
@ -678,6 +679,7 @@
    "Nbg Bm": "Neuberger Berman",
    "Nflz": "Neuflize",
    "Netrl": "Neutral",
    "NEF": "Nil Entry",
    "New Capital": "New Capital Fund Lux",
    "Nwtn": "Newton",
    "NN (B) Invest": "NN (B) Invest",
--- a/configuration/aus_prospectus/all_datapoints/datapoint_keyword_all.json
+++ b/configuration/aus_prospectus/all_datapoints/datapoint_keyword_all.json
@ -0,0 +1,27 @@
 {
  "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
  "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs", "Management costs"]},
  "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
  "performance_fee": {"english": ["performance fee", "performance fees"]},
  "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
  "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
  "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
  "establishment_fee": {"english": ["establishment fee", "establishment fees"]},
  "contribution_fee": {"english": ["contribution fee", "contribution fees"]},
  "withdrawal_fee": {"english": ["withdrawal fee", "withdrawal fees"]},
  "switching_fee": {"english": ["switching fee", "switching fees"]},
  "activity_fee": {"english": ["activity fee", "activity fees"]},
  "exit_fee": {"english": ["exit fee", "exit fees"]},
  "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
  "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
  "additional_hurdle": {"english": ["additional hurdle","performance hardle"]},
  "benchmark_name": {"english": ["benchmark fund","benchmark name"]},
  "reference_rate": {"english": ["reference rate"]},
  "crystallisation_frequency": {"english": ["crystallisation frequency"]},
  "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
  "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
  "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
  "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
  "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
  "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
 }
--- a/configuration/aus_prospectus/all_datapoints/datapoint_level_all.json
+++ b/configuration/aus_prospectus/all_datapoints/datapoint_level_all.json
@ -0,0 +1,27 @@
 {
    "total_annual_dollar_based_charges": "share_level",
    "management_fee_and_costs": "share_level",
    "management_fee": "share_level",
    "performance_fee": "share_level",
    "performance_fee_costs": "share_level",
    "buy_spread": "share_level",
    "sell_spread": "share_level",
    "establishment_fee": "share_level",
    "contribution_fee": "share_level",
    "withdrawal_fee": "share_level",
    "switching_fee": "share_level",
    "activity_fee": "share_level",
    "exit_fee": "share_level", 
    "administration_fees": "share_level",
    "interposed_vehicle_performance_fee_cost": "share_level", 
    "additional_hurdle": "share_level",
    "benchmark_name": "fund_level",
    "reference_rate": "share_level",
    "crystallisation_frequency": "share_level",
    "date_of_last_hwm_reset": "share_level",
    "date_of_last_performance_fee_restructure": "share_level",
    "high_water_mark_type": "share_level",
    "minimum_initial_investment": "fund_level",
    "recoverable_expenses": "share_level",
    "indirect_costs": "share_level"
 }
--- a/configuration/aus_prospectus/all_datapoints/datapoint_name_all.json
+++ b/configuration/aus_prospectus/all_datapoints/datapoint_name_all.json
@ -0,0 +1,27 @@
 {
    "total_annual_dollar_based_charges": "total annual dollar based charges",
    "management_fee_and_costs": "management fee and costs",
    "management_fee": "management fee",
    "performance_fee": "performance fee",
    "performance_fee_costs": "performance fee costs",
    "buy_spread": "buy spread",
    "sell_spread": "sell spread",
    "establishment_fee": "establishment fee",
    "contribution_fee": "contribution fee",
    "withdrawal_fee": "withdrawal fee",
    "switching_fee": "switching fee",
    "activity_fee": "activity fee",
    "exit_fee": "exit fee", 
    "administration_fees": "administration fee",
    "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", 
    "additional_hurdle": "additional hurdle",
    "benchmark_name": "benchmark name",
    "reference_rate": "reference rate",
    "crystallisation_frequency": "crystallisation frequency",
    "date_of_last_hwm_reset": "date of last hwm reset",
    "date_of_last_performance_fee_restructure": "date of last performance fee restructure",
    "high_water_mark_type": "high-water mark type",
    "minimum_initial_investment": "minimum initial investment",
    "recoverable_expenses": "recoverable expenses",
    "indirect_costs": "indirect cost"
 }
--- a/configuration/aus_prospectus/all_datapoints/datapoint_reported_name_all.json
+++ b/configuration/aus_prospectus/all_datapoints/datapoint_reported_name_all.json
@ -0,0 +1,27 @@
 {
    "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs"]},
    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs"]},
    "performance_fee": {"english": ["performance fee", "performance fees"]},
    "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
    "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
    "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
    "establishment_fee": {"english": ["establishment fee", "establishment fees"]},
    "contribution_fee": {"english": ["contribution fee", "contribution fees"]},
    "withdrawal_fee": {"english": ["withdrawal fee", "withdrawal fees"]},
    "switching_fee": {"english": ["switching fee", "switching fees"]},
    "activity_fee": {"english": ["activity fee", "activity fees"]},
    "exit_fee": {"english": ["exit fee", "exit fees"]},
    "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
    "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
    "additional_hurdle": {"english": ["additional hurdle","performance hardle"]},
    "benchmark_name": {"english": ["benchmark fund","benchmark name"]},
    "reference_rate": {"english": ["reference rate"]},
    "crystallisation_frequency": {"english": ["crystallisation frequency"]},
    "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
    "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
    "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
    "minimum_initial_investment": {"english": ["minimum initial investment","inital investment", "initial investment amount"]},
    "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
    "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
  }
--- a/configuration/aus_prospectus/all_datapoints/datapoint_type_all.json
+++ b/configuration/aus_prospectus/all_datapoints/datapoint_type_all.json
@ -0,0 +1,27 @@
 {
    "total_annual_dollar_based_charges": "float",
    "management_fee_and_costs": "float",
    "management_fee": "float",
    "performance_fee": "float",
    "performance_fee_costs": "float",
    "buy_spread": "float",
    "sell_spread": "float",
    "establishment_fee": "float",
    "contribution_fee": "float",
    "withdrawal_fee": "float",
    "switching_fee": "float",
    "activity_fee": "float",
    "exit_fee": "float", 
    "administration_fees": "float",
    "interposed_vehicle_performance_fee_cost": "float", 
    "additional_hurdle": "text",
    "benchmark_name": "text",
    "reference_rate": "float",
    "crystallisation_frequency": "text",
    "date_of_last_hwm_reset": "text",
    "date_of_last_performance_fee_restructure": "text",
    "high_water_mark_type": "text",
    "minimum_initial_investment": "integer",
    "recoverable_expenses": "float",
    "indirect_costs": "float"
 }
--- a/configuration/aus_prospectus/all_datapoints/domicile_datapoints_all.json
+++ b/configuration/aus_prospectus/all_datapoints/domicile_datapoints_all.json
@ -0,0 +1,51 @@
 {
  "CAN": {
    "ar": [
      "mer",
      "tor",
      "trading expense ratio"
    ]
  },
  "IND": {
    "ar": [
      "ter",
      "MgtFee",
      "tor"
    ]
  },
  "default": {
    "ar": [
      "tor",
      "ter",
      "ogc",
      "performance_fee"
    ],
    "prospectus": [
      "total_annual_dollar_based_charges",
      "management_fee_and_costs",
      "management_fee",
      "performance_fee",
      "performance_fee_costs",
      "buy_spread",
      "sell_spread",
      "establishment_fee",
      "contribution_fee",
      "withdrawal_fee",
      "switching_fee",
      "activity_fee",
      "exit_fee", 
      "administration_fees",
      "interposed_vehicle_performance_fee_cost", 
      "additional_hurdle",
      "benchmark_name",
      "reference_rate",
      "crystallisation_frequency",
      "date_of_last_hwm_reset",
      "date_of_last_performance_fee_restructure",
      "high_water_mark_type",
      "minimum_initial_investment",
      "recoverable_expenses",
      "indirect_costs"
    ]
  }
 }
--- a/configuration/aus_prospectus/datapoint_keyword.json
+++ b/configuration/aus_prospectus/datapoint_keyword.json
@ -1,27 +1,15 @@
 {
-  "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
+  "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]},
-  "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "Plus other investment fees and costs"]},
+  "management_fee_and_costs": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
-  "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
+  "management_fee": {"english": ["management fee", "management fees", "investment management fees", "management fees and cost", "management fees and costs", "investment fees and costs", "Management costs", "investment fee and costs", "Investment fees", "investment option management costs", "investment option management costs1"]},
-  "performance_fee": {"english": ["performance fee", "performance fees"]},
+  "performance_fee_costs": {"english": ["performance fee", "performance fees"]},
  "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
  "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
  "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
  "establishment_fee": {"english": ["establishment fee", "establishment fees"]},
  "contribution_fee": {"english": ["contribution fee", "contribution fees"]},
  "withdrawal_fee": {"english": ["withdrawal fee", "withdrawal fees"]},
  "switching_fee": {"english": ["switching fee", "switching fees"]},
  "activity_fee": {"english": ["activity fee", "activity fees"]},
  "exit_fee": {"english": ["exit fee", "exit fees"]},
  "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
-  "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
+  "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance", "interposed vehicles", "interposed vehicle"]},
-  "additional_hurdle": {"english": ["additional hurdle","performance hardle"]},
+  "benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
-  "benchmark_name": {"english": ["benchmark fund","benchmark name"]},
+  "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
-  "reference_rate": {"english": ["reference rate"]},
+  "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
-  "crystallisation_frequency": {"english": ["crystallisation frequency"]},
+  "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
-  "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
+  "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
  "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
  "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
  "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
  "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
  "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
 }
--- a/configuration/aus_prospectus/datapoint_level.json
+++ b/configuration/aus_prospectus/datapoint_level.json
@ -2,26 +2,14 @@
    "total_annual_dollar_based_charges": "share_level",
    "management_fee_and_costs": "share_level",
    "management_fee": "share_level",
    "performance_fee": "share_level",
    "performance_fee_costs": "share_level",
    "buy_spread": "share_level",
    "sell_spread": "share_level",
    "establishment_fee": "share_level",
    "contribution_fee": "share_level",
    "withdrawal_fee": "share_level",
    "switching_fee": "share_level",
    "activity_fee": "share_level",
    "exit_fee": "share_level", 
    "administration_fees": "share_level",
    "interposed_vehicle_performance_fee_cost": "share_level", 
    "additional_hurdle": "share_level",
    "benchmark_name": "fund_level",
-    "reference_rate": "share_level",
+    "minimum_initial_investment": "fund_level",
-    "crystallisation_frequency": "share_level",
+    "indirect_costs": "share_level",
    "date_of_last_hwm_reset": "share_level",
    "date_of_last_performance_fee_restructure": "share_level",
    "high_water_mark_type": "share_level",
    "minimum_initial_investment": "share_level",
    "recoverable_expenses": "share_level",
-    "indirect_costs": "share_level"
+    "change_recoverable_expenses": "share_level"
 }
--- a/configuration/aus_prospectus/datapoint_name.json
+++ b/configuration/aus_prospectus/datapoint_name.json
@ -1,27 +1,15 @@
 {
    "total_annual_dollar_based_charges": "total annual dollar based charges",
    "management_fee_and_costs": "management fee and costs",
    "management_fee": "management fee",
-    "performance_fee": "performance fee",
+    "administration_fees": "administration fee",
-    "performance_fee_costs": "performance fee costs",
+    "performance_fee_costs": "performance fee",
    "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost",
    "buy_spread": "buy spread",
    "sell_spread": "sell spread",
-    "establishment_fee": "establishment fee",
+    "total_annual_dollar_based_charges": "total annual dollar based charges",
    "contribution_fee": "contribution fee",
    "withdrawal_fee": "withdrawal fee",
    "switching_fee": "switching fee",
    "activity_fee": "activity fee",
    "exit_fee": "exit fee", 
    "administration_fees": "administration fee",
    "interposed_vehicle_performance_fee_cost": "interposed vehicle performance fee cost", 
    "additional_hurdle": "additional hurdle",
    "benchmark_name": "benchmark name",
    "reference_rate": "reference rate",
    "crystallisation_frequency": "crystallisation frequency",
    "date_of_last_hwm_reset": "date of last hwm reset",
    "date_of_last_performance_fee_restructure": "date of last performance fee restructure",
    "high_water_mark_type": "high-water mark type",
    "minimum_initial_investment": "minimum initial investment", 
    "benchmark_name": "benchmark name",
    "indirect_costs": "indirect cost",
    "recoverable_expenses": "recoverable expenses",
-    "indirect_costs": "indirect cost"
+    "change_recoverable_expenses": "change recoverable expanses"
 }
--- a/configuration/aus_prospectus/datapoint_reported_name.json
+++ b/configuration/aus_prospectus/datapoint_reported_name.json
@ -1,27 +1,15 @@
 {
-    "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar"]},
+    "total_annual_dollar_based_charges": {"english": ["total annual dollar based charges", "total annual dollar based charges ($)","total annual dollar", "administration fees and costs", "Administration fee", "Administration fees"]},
-    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs"]},
+    "management_fee_and_costs": {"english": ["management fees and cost", "management fees and costs", "management fee and cost", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
-    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs"]},
+    "management_fee": {"english": ["management fee", "management fees","investment management fees","management fees and cost", "investment option management costs", "investment option management costs1", "Plus other investment fees and costs", "Management costs", "investment fees and costs", "investment fee and cost", "Investment fees"]},
-    "performance_fee": {"english": ["performance fee", "performance fees"]},
+    "performance_fee_costs": {"english": ["performance fee", "performance fees"]},
    "performance_fee_costs": {"english": ["performance fee costs", "performance fees costs"]},
    "buy_spread": {"english": ["buy-spread", "buy spread", "buy/sell spreads", "BUY-SELL SPREAD"]},
    "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
    "establishment_fee": {"english": ["establishment fee", "establishment fees"]},
    "contribution_fee": {"english": ["contribution fee", "contribution fees"]},
    "withdrawal_fee": {"english": ["withdrawal fee", "withdrawal fees"]},
    "switching_fee": {"english": ["switching fee", "switching fees"]},
    "activity_fee": {"english": ["activity fee", "activity fees"]},
    "exit_fee": {"english": ["exit fee", "exit fees"]},
    "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
    "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
-    "additional_hurdle": {"english": ["additional hurdle","performance hardle"]},
+    "benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
-    "benchmark_name": {"english": ["benchmark fund","benchmark name"]},
+    "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
-    "reference_rate": {"english": ["reference rate"]},
+    "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]},
-    "crystallisation_frequency": {"english": ["crystallisation frequency"]},
+    "recoverable_expenses": {"english": ["recoverable expenses", "recoverable expense", "recoverable cost", "recoverable costs", "expense recoveries"]},
-    "date_of_last_hwm_reset": {"english": ["date of last hwm reset"]},
+    "change_recoverable_expenses": {"english": ["change recoverable expanses","change expanse recovery","change expanse recoveries","change recoverable expanse"]}
    "date_of_last_performance_fee_restructure": {"english": ["date of last performance fee restructure"]},
    "high_water_mark_type": {"english": ["high-water mark type", "high water mark type"]},
    "minimum_initial_investment": {"english": ["minimum initial investment","inital investment"]},
    "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
    "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}
  }
--- a/configuration/aus_prospectus/datapoint_type.json
+++ b/configuration/aus_prospectus/datapoint_type.json
@ -0,0 +1,15 @@
 {
    "total_annual_dollar_based_charges": "float",
    "management_fee_and_costs": "float",
    "management_fee": "float",
    "performance_fee_costs": "float",
    "buy_spread": "float",
    "sell_spread": "float",
    "administration_fees": "float",
    "interposed_vehicle_performance_fee_cost": "float", 
    "benchmark_name": "text",
    "minimum_initial_investment": "integer",
    "indirect_costs": "float",
    "recoverable_expenses": "float",
    "change_recoverable_expenses": "float"
 }
--- a/configuration/aus_prospectus/domicile_datapoints.json
+++ b/configuration/aus_prospectus/domicile_datapoints.json
@ -24,28 +24,16 @@
      "total_annual_dollar_based_charges",
      "management_fee_and_costs",
      "management_fee",
      "performance_fee",
      "performance_fee_costs",
      "buy_spread",
      "sell_spread",
      "establishment_fee",
      "contribution_fee",
      "withdrawal_fee",
      "switching_fee",
      "activity_fee",
      "exit_fee", 
      "administration_fees",
      "interposed_vehicle_performance_fee_cost", 
      "additional_hurdle",
      "benchmark_name",
      "reference_rate",
      "crystallisation_frequency",
      "date_of_last_hwm_reset",
      "date_of_last_performance_fee_restructure",
      "high_water_mark_type",
      "minimum_initial_investment",
-      "recoverable_expenses",
+      "indirect_costs",
-      "indirect_costs"
+      "change_recoverable_expenses",
      "recoverable_expenses"
    ]
  }
 }
--- a/configuration/aus_prospectus/misc_config.json
+++ b/configuration/aus_prospectus/misc_config.json
@ -1,4 +1,4 @@
 {
-    "apply_pdf2html": true,
+    "apply_pdf2html": false,
    "apply_drilldown": false
 }
--- a/configuration/aus_prospectus/replace_table_header.json
+++ b/configuration/aus_prospectus/replace_table_header.json
@ -0,0 +1,47 @@
 {
    "details": [
        {
            "regex_all_list": 
            ["\\nIndirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n",
            "\\n(Investment\\s*Option|Fund)[\\s\\S]*?Management\\s*fee[\\s\\S]*?Indirect\\s*costs[\\s\\S]*?performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?indirect\\s*costs[\\s\\S]*?(interposed\\s*vehicles|managers\\s*vehicles)\\s*\\n",
            "\\nOption\\s*name\\s*Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Management\\s*fee[\\s\\S]*?Buy\\/sell\\s*spreads[\\s\\S]*?Recoverable\\s*expenses[\\s\\S]*?interposed\\s*vehicles\\s*\\n"],
            "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
            "comments": ["item 0: document 410899007",
                         "item 1: document 539266880, 539266817, 539261734",
                         "item 2: document 539266893"]
        },
        {
            "regex_all_list":
            ["Indirect costs[\\s\\S]*?Estimated\\s*performance\\s*fees[\\s\\S]*?Investment\\s*Option\\s*Management\\s*fee[\\s\\S]*?Transactions\\s*costs[\\s\\S]*?Buy\\/sell\\s*spreads\\s*\\(\\%\\)\\s*\\n"],
            "replace_text": "\nInvestment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n",
            "comments": ["item 0: document 410899007"]
        },
        {
            "regex_all_list":
            ["Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Entry\\s*Fee[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Other\\s*investment\\s*costs[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Nil\\s*Entry\\s*Fee\\s*.*\\n",
             "Management\\s*Fees\\s*and\\s*costs\\s*[\\s\\S]*?Ongoing\\s*Fee.*?\\(A\\)[\\s\\S]*?\\(D\\)\\s*Total\\s*Fees\\s*and\\s*Costs\\s*Investment\\s*fund\\s*Estimated\\s*Other[\\s\\S]*?Entry\\s*Fee\\s*Nil\\s*Entry[\\s\\S]*?Nil\\s*Entry[\\s\\S]*?Performance\\s*fees[\\s\\S]*?Transaction\\s*costs[\\s\\S]*?Fee\\s*option.*\\n"],
            "replace_text": "\nManagement Fees and costs \nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \n",
            "comments": ["item 0: document 401212184, page 17",
                         "item 1: document 401212184, page 18 - 20"]
        },
        {
            "regex_all_list":
            ["Investment\\s*option\\s*Administration fees[\\s\\S]*?administration\\s*costs\\s*Investment\\s*fees[\\s\\S]*?investment\\s*costs\\s*Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Estimated\\s*administration[\\s\\S]*?transaction\\s*costs[\\s\\S]*?annual\\s*fees\\s*and\\s*costs\\s*\\(\\%\\s*pa\\)\\s*\\n"],
            "replace_text": "\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \n",
            "comments": ["item 0: document 411062815, page 17"]
        },
        {
            "regex_all_list":
            ["\\nFund\\s*name\\s*Management\\s*fee\\s*Indirect\\s*costs\\s*Recoverable\\s*expenses[\\s\\S]*?performance.*\\s*fee\\s*Estimated\\s*other\\s*indirect\\s*costs\\s*\\n"],
            "replace_text": "\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \n",
            "comments": ["item 0: document 391080133, page 21"]
        },
        {
            "regex_all_list":
            ["The\\s*investment\\s*fees\\s*and\\s*costs[\\s\\S]*?Performance\\s*fee\\s*Plus\\s*other\\s*investment\\s*fees\\s*and\\s*costs\\s*Equals\\s*investment\\s*fees\\s*and\\s*costs\\s*Transaction\\s*costs[\\s\\S]*?Buy\\-sell\\s*spreads\\s*Transaction[\\s\\S]*?Entry[\\s\\S]*pa\\s*\\n"],
            "replace_text": "Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross)\n",
            "comments": ["item 0: document 420339794, page 74"]
        }
    ]
 }
--- a/configuration/aus_prospectus/special_datapoint_feature.json
+++ b/configuration/aus_prospectus/special_datapoint_feature.json
@ -0,0 +1,42 @@
 {
    "management_fee_including_performance_fee": {
        "details": [
            {"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
            "effective_datapoints": ["management_fee_and_costs"],
            "exclude_datapoints": ["performance_fee_costs"]},
            {"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
            "effective_datapoints": ["performance_fee_costs"],
            "exclude_datapoints": ["management_fee_and_costs"]}
        ],
        "provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK", 
        "0C000069VJ", "0C0000AL58", "0C00006B9E", 
        "0C00006BDB", "0C00006BDD", "0C00006BDG", 
        "0C000035YC", "0C0000CSKN", "0C00005549",
         "0C000051C6", "0C00008JA0", "0C000093Z4", 
         "0C0000B5L6", "0C00006EGK", "0C00006EJI", 
         "0C00006FYL", "0C00006G0Q", "0C00006GIF", 
         "0C00006GNW", "0C00006GPU", "0C00006H46", 
         "0C00006H4J", "0C00006H4Q", "0C0000A5XQ", 
         "0C0000BBPL", "0C0000C2MS", "0C0000CVRL", 
         "0C0000AV6P", "0C00001XXQ", "0C00001XYR", 
         "0C00006AZB", "0C00006BN6", "0C00006BXE", 
         "0C00006CIK", "0C00006CJ2", "0C00006DOA", 
         "0C0000CAQF", "0C0000CAQH", "0C0000CAQO", 
         "0C0000CAQR"],
        "provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd", 
        "Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd", 
        "RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd", 
        "JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd", 
        "Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd", 
        "GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd", 
        "Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd", 
        "Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd", 
        "Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd", 
        "Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd", 
        "Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd", 
        "AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd", 
        "Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd", 
        "KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd", 
        "PostSuper Pty Ltd", "Legal Super Pty Ltd"]
    }
 }
--- a/configuration/emea_ar/datapoint_type.json
+++ b/configuration/emea_ar/datapoint_type.json
@ -0,0 +1,6 @@
 {
    "tor": "float",
    "ogc": "float",
    "ter": "float",
    "performance_fee": "float"
 }
--- a/configuration/emea_ar/misc_config.json
+++ b/configuration/emea_ar/misc_config.json
@ -1,4 +1,4 @@
 {
    "apply_pdf2html": false,
-    "apply_drilldown": true
+    "apply_drilldown": false
 }
--- a/configuration/emea_ar/replace_table_header.json
+++ b/configuration/emea_ar/replace_table_header.json
@ -0,0 +1,3 @@
 {
    "details": []
 }
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@ -42,7 +42,9 @@ def get_abb_json(doc_source: str = "aus_prospectus"):
 def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
    """Replaces abbreviations in a fund name with their expanded forms."""
    # Convert fund name to lowercase while matching
-    f_list = fundname.lower().split()
+    # replace special characters with space
    f_list = re.sub(r'[^a-zA-Z0-9\s]', ' ', fundname).lower().split()
    # f_list = fundname.lower().split()
    get_abb_json(doc_source)
    updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
    return " ".join(updated_doc_fname_words)
@ -616,6 +618,7 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
            print("Error: ",e)
    # print("==>>> DB LIST: ",unmatched_db_list)
    # print("==>>> PRED LIST: ",unmatched_pred_list)
    unmatched_db_list = db_list.copy()
    if len(unmatched_pred_list)!=0:
        cleaned_unmatched_pred_list = replace_abbrevs_in_fundnames(unmatched_pred_list, doc_source)
        cleaned_unmatched_pred_list = remove_stopwords_nltk(cleaned_unmatched_pred_list)
@ -625,16 +628,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
        cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
        cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
        prompt_context = f"""
-    {prompt_instruction}
+        {prompt_instruction}
-    provider_name: {provider_name}
+        provider_name: {provider_name}
-    prediction_fund: 
+        prediction_fund: 
-    {cleaned_unmatched_pred_list}
+        {cleaned_unmatched_pred_list}
-    true_fund: 
+        true_fund: 
-    {cleaned_unmatched_db_list}
+        {cleaned_unmatched_db_list}
-    """
+        """
        # print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list)
        # print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list)
        # llm_response = get_llm_response(prompt_context)
@ -650,59 +653,114 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
                    llm_result = json_repair.loads(llm_response['response'])
                except:
                    llm_result = {}
-            # try:
+            unmantched_pred_index_list = post_handle_fund_matching_call(llm_result, 
-            #     llm_result = ast.literal_eval(llm_response['response'].replace('\n',''))
+                                                                        unmatched_pred_list, 
-            # except Exception as e:
+                                                                        cleaned_unmatched_pred_list, 
-            #     logger.info(f"error: {e}")
+                                                                        unmatched_db_list, 
-            #     cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '')
+                                                                        cleaned_unmatched_db_list, 
-            #     llm_result = json.loads(cleaned_response)
+                                                                        df_data, 
-            # logger.info(f"\n\n llm_result: {llm_result}")            
+                                                                        final_result,
-            for k,v in llm_result.items():
+                                                                        record_empty=False)
-                # print("k: ",k)
+            """
-                # print("v: ",v)
+            For some cases, same document, 
-                og_db_index=-1
+            perhaps same funds/ shares are with different raw names in different pages.
-                og_pred_index = -1
+            e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
-                if k in cleaned_unmatched_pred_list:
+            But if only call ChatGPT API one time, it will not be able to match all of them.
-                    og_pred_index = cleaned_unmatched_pred_list.index(k)
+            """
            if len(unmantched_pred_index_list)>0:
                unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
                cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
                prompt_context = f"""
                {prompt_instruction}
-                if og_pred_index == -1:
+                provider_name: {provider_name}
                    # sometimes, the raw name and db name reversed from the LLM response
                    if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
                        og_pred_index = cleaned_unmatched_pred_list.index(v)
                        og_db_index = cleaned_unmatched_db_list.index(k)
                        # v and k are swapped
                        temp = v
                        v = k
                        k = temp
                if og_pred_index==-1:
                    continue
                # og_db_index = cleaned_unmatched_db_list.index(v)
                if og_db_index == -1 and v in cleaned_unmatched_db_list:
                    og_db_index = cleaned_unmatched_db_list.index(v)
                # print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
                # print("unmatched_db_list: ",unmatched_db_list)
-                for i in df_data:
+                prediction_fund: 
-                    if i['pred_fund']==unmatched_pred_list[og_pred_index]:
+                {cleaned_unmatched_pred_list}
                        if og_db_index!=-1:
                            i['db_fund']=unmatched_db_list[og_db_index]
                            i['cleaned_db_fund_name'] = v
                            final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
                        else:
                            i['db_fund'] = ''
                            i['cleaned_db_fund_name'] = ''
                            final_result.update({unmatched_pred_list[og_pred_index]:""})
                        i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                        i['llm_clean_db_list'] = cleaned_unmatched_db_list,
                        i['llm_pred_fund'] = k
                        i['llm_matched_db_name'] = v
                        i['llm_result'] = llm_result
                        break
                true_fund: 
                {cleaned_unmatched_db_list}
                """
                llm_response, with_error = chat(
                    prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
                    )
                # logger.info(f"fund matching LLM Response: {llm_response}")
                if 'response' in llm_response.keys():
                    try:
                        llm_result = json.loads(llm_response['response'])
                    except:
                        try:
                            llm_result = json_repair.loads(llm_response['response'])
                        except:
                            llm_result = {}
                    unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
                                                                            unmatched_pred_list, 
                                                                            cleaned_unmatched_pred_list, 
                                                                            unmatched_db_list, 
                                                                            cleaned_unmatched_db_list, 
                                                                            df_data, 
                                                                            final_result,
                                                                            record_empty=True)
        # break
    return final_result
 def post_handle_fund_matching_call(llm_result,
                                   unmatched_pred_list,
                                   cleaned_unmatched_pred_list,
                                   unmatched_db_list,
                                   cleaned_unmatched_db_list,
                                   df_data,
                                   final_result,
                                   record_empty: bool = False):
    """Apply one round of LLM fund-matching output to the bookkeeping structures.

    Each ``{name: name}`` pair in ``llm_result`` is looked up in the cleaned
    prediction / DB name lists (the pair may arrive reversed) and translated
    back to the original names through the shared list positions.

    Args:
        llm_result: Parsed LLM response; expected to be a dict mapping a
            cleaned predicted fund name to a cleaned DB fund name.
        unmatched_pred_list: Original predicted fund names, index-aligned with
            ``cleaned_unmatched_pred_list``.
        cleaned_unmatched_pred_list: Cleaned predicted fund names.
        unmatched_db_list: Original DB fund names, index-aligned with
            ``cleaned_unmatched_db_list``.
        cleaned_unmatched_db_list: Cleaned DB fund names.
        df_data: List of dict rows keyed by ``'pred_fund'``; matching rows are
            updated in place with the match result and LLM debug fields.
        final_result: Dict of ``{original pred name: original DB name}``,
            updated in place.
        record_empty: When True, predictions without a DB match are written to
            ``final_result`` with an empty string; when False they are left out.

    Returns:
        Indices (into ``unmatched_pred_list``) of predictions the LLM answered
        for but which could not be resolved to a DB fund. One index is appended
        per affected ``df_data`` row, so an index can repeat.
    """
    unmantched_pred_index_list = []
    # A parsed LLM payload is not guaranteed to be a JSON object; anything
    # else carries no usable name pairs, so treat it as "nothing matched".
    if not isinstance(llm_result, dict):
        return unmantched_pred_index_list
    for pred_name, db_name in llm_result.items():
        og_db_index = -1
        # The same cleaned name can occur several times, keep every position.
        og_pred_index_list = [
            c_idx
            for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
            if c_item == pred_name
        ]
        if not og_pred_index_list and pred_name in cleaned_unmatched_db_list:
            # Sometimes the LLM returns the pair reversed (DB name as key,
            # predicted name as value); detect that and swap the roles.
            swapped_index_list = [
                c_idx
                for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
                if c_item == db_name
            ]
            if swapped_index_list:
                og_pred_index_list = swapped_index_list
                og_db_index = cleaned_unmatched_db_list.index(pred_name)
                pred_name, db_name = db_name, pred_name
        if not og_pred_index_list:
            # The key is unknown on both sides: ignore this pair.
            continue
        if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
            og_db_index = cleaned_unmatched_db_list.index(db_name)
        for i in df_data:
            for og_pred_index in og_pred_index_list:
                if i['pred_fund'] != unmatched_pred_list[og_pred_index]:
                    continue
                if og_db_index != -1:
                    i['db_fund'] = unmatched_db_list[og_db_index]
                    i['cleaned_db_fund_name'] = db_name
                    final_result.update({unmatched_pred_list[og_pred_index]: unmatched_db_list[og_db_index]})
                else:
                    unmantched_pred_index_list.append(og_pred_index)
                    i['db_fund'] = ''
                    i['cleaned_db_fund_name'] = ''
                    if record_empty:
                        final_result.update({unmatched_pred_list[og_pred_index]: ""})
                i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                # Store the list itself; a trailing comma here previously
                # wrapped it in a one-element tuple.
                i['llm_clean_db_list'] = cleaned_unmatched_db_list
                i['llm_pred_fund'] = pred_name
                i['llm_matched_db_name'] = db_name
                i['llm_result'] = llm_result
                # This row is handled; move on to the next df_data row.
                break
    return unmantched_pred_index_list
 def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
    result = api_response['data']
    doc_fund_names = [item['fund_name'] for item in result]
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
--- a/core/data_mapping.py
+++ b/core/data_mapping.py
@ -228,7 +228,178 @@ class DataMapping:
                            mapped_data["similarity"] = 1
        self.output_mapping_file(mapped_data_list)
-        return mapped_data_list
+        
        if self.doc_source == "aus_prospectus":
            output_data_folder_splits = self.output_data_excel_folder.split("output")
            if len(output_data_folder_splits) == 2:
                merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
                os.makedirs(merged_data_folder, exist_ok=True)
                merged_data_json_folder = os.path.join(merged_data_folder, "json/")
                os.makedirs(merged_data_json_folder, exist_ok=True)
                merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
                os.makedirs(merged_data_excel_folder, exist_ok=True)
                merged_data_list = self.merge_output_data_aus_prospectus(mapped_data_list,
                                                                         merged_data_json_folder,
                                                                         merged_data_excel_folder)
                return merged_data_list
        else:        
            return mapped_data_list
    def merge_output_data_aus_prospectus(self,
                                         mapped_data_list: list, 
                                         merged_data_json_folder: str,
                                         merged_data_excel_folder: str):
        """Pivot per-datapoint mapping rows into one record per investment.

        Rows of ``mapped_data_list`` (one extracted datapoint value each) are
        grouped into "wide" records carrying one key per datapoint name from
        ``./configuration/<doc_source>/datapoint_name.json``. The merged
        records are written to ``merged_<doc_id>.xlsx`` (sheet
        ``merged_data``) and ``merged_<doc_id>.json`` and then returned.

        :param mapped_data_list: list of dict rows; the keys read here are
            doc_id, page_index, raw_fund_name, raw_share_name, raw_name,
            datapoint, value, investment_type, investment_id, investment_name.
        :param merged_data_json_folder: folder receiving the JSON output.
        :param merged_data_excel_folder: folder receiving the Excel output.
        :return: list of merged record dicts, or None when any argument is
            None/empty.

        NOTE(review): ``self.document_mapping_info_df`` is NaN-filled in place
        as a side effect, and its first ``EffectiveDate`` row is read without
        an emptiness check - confirm callers always supply a non-empty frame.
        """
        # NOTE(review): the original "TODO ... plan to realize it on 2025-01-16"
        # note looked stale - the merge is implemented below.
        if mapped_data_list is None or len(mapped_data_list) == 0:
            return
        if merged_data_json_folder is None or len(merged_data_json_folder) == 0:
            return
        if merged_data_excel_folder is None or len(merged_data_excel_folder) == 0:
            return    
        mapping_data_df = pd.DataFrame(mapped_data_list)
        mapping_data_df.reset_index(drop=True, inplace=True)
        mapping_data_df.fillna("", inplace=True)
        # Same object as self.document_mapping_info_df, so the fillna below
        # also changes the instance attribute.
        document_mapping_df = self.document_mapping_info_df
        document_mapping_df.fillna("", inplace=True)
        datapoint_keyword_config_file = (
            f"./configuration/{self.doc_source}/datapoint_name.json"
        )
        with open(datapoint_keyword_config_file, "r", encoding="utf-8") as f:
            datapoint_keyword_config = json.load(f)
        # The config keys become the datapoint columns of every merged record.
        datapoint_name_list = list(datapoint_keyword_config.keys())
        total_data_list = []
        # First 10 characters of the stringified value (presumably YYYY-MM-DD).
        doc_date = str(document_mapping_df["EffectiveDate"].values[0])[0:10]
        # Pass 1: rows with investment_type == 1, handled as share-class rows.
        share_doc_data_df = mapping_data_df[(mapping_data_df["investment_type"] == 1)]
        # Tracks which (raw_name, investment_type, investment_id) already own a record.
        exist_raw_name_list = []
        for index, row in share_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
            page_index = int(row["page_index"])
            raw_fund_name = str(row["raw_fund_name"])
            raw_share_name = str(row["raw_share_name"])
            raw_name = str(row["raw_name"])
            datapoint = str(row["datapoint"])
            value = row["value"]
            investment_type = row["investment_type"]
            share_class_id = row["investment_id"]
            share_class_legal_name = row["investment_name"]
            fund_id = ""
            fund_legal_name = ""
            # Resolve the parent fund of a mapped share class via the SecId column.
            if share_class_id != "":
                record_row = document_mapping_df[document_mapping_df["SecId"] == share_class_id]
                if len(record_row) > 0:
                    fund_id = record_row["FundId"].values[0]
                    fund_legal_name = record_row["FundName"].values[0]
            # A record already exists when the raw name (and type) was seen
            # before, or when a non-empty investment id was seen before.
            exist = False
            for exist_raw_name_info in exist_raw_name_list:
                exist_raw_name = exist_raw_name_info["raw_name"]
                exist_investment_type = exist_raw_name_info["investment_type"]
                exist_investment_id = exist_raw_name_info["investment_id"]
                if (
                    exist_raw_name == raw_name
                    and exist_investment_type == investment_type
                ) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id):
                    exist = True
                    break
            if not exist:
                data = {
                    "doc_id": doc_id,
                    "effective_date": doc_date,
                    "raw_fund_name": raw_fund_name,
                    "raw_share_name": raw_share_name,
                    "raw_name": raw_name,
                    "fund_id": fund_id,
                    "fund_name": fund_legal_name,
                    "sec_id": share_class_id,
                    "sec_name": share_class_legal_name,
                    "page_index": [],
                }
                # Every datapoint starts empty; values are filled in just below.
                for datapoint_name in datapoint_name_list:
                    data[datapoint_name] = ""
                exist_raw_name_list.append(
                    {"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id}
                )
                total_data_list.append(data)
            # find data from total_data_list by raw_name
            # Only the first matching record is updated (both branches break).
            for data in total_data_list:
                if data["raw_name"] == raw_name:
                    # Same raw name: the new value overwrites any earlier one.
                    update_key = datapoint
                    data[update_key] = value
                    if page_index not in data["page_index"]:
                        data["page_index"].append(page_index)
                    break
                if len(share_class_id) > 0 and data["sec_id"] == share_class_id:
                    # Same security under a different raw name: only fill the
                    # datapoint when it is still empty.
                    update_key = datapoint
                    if len(str(data[update_key])) == 0:
                        data[update_key] = value
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                    break
        # Pass 2: rows with investment_type == 33, handled as fund-level rows.
        fund_doc_data_df = mapping_data_df[(mapping_data_df["investment_type"] == 33)]
        # mapping_data_df was already NaN-filled above; this runs on a filtered
        # selection. NOTE(review): pandas may warn about in-place ops on a slice.
        fund_doc_data_df.fillna("", inplace=True)
        for index, row in fund_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
            page_index = int(row["page_index"])
            raw_fund_name = str(row["raw_fund_name"])
            raw_share_name = ""
            raw_name = str(row["raw_name"])
            datapoint = str(row["datapoint"])
            value = row["value"]
            fund_id = row["investment_id"]
            fund_legal_name = row["investment_name"]
            exist = False
            if fund_id != "":
                # Mapped fund: write the value onto every record with the same
                # fund id or the same raw fund name (no break, all are updated).
                for data in total_data_list:
                    if (fund_id != "" and data["fund_id"] == fund_id) or (
                        data["raw_fund_name"] == raw_fund_name
                    ):
                        update_key = datapoint
                        data[update_key] = value
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                        exist = True
            else:
                # Unmapped fund: fall back to matching records by raw name.
                for data in total_data_list:
                    if data["raw_name"] == raw_name:
                        update_key = datapoint
                        data[update_key] = value
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                        exist = True
            if not exist:
                # No record matched: add a fund-level record without share info.
                data = {
                    "doc_id": doc_id,
                    "effective_date": doc_date,
                    "raw_fund_name": raw_fund_name,
                    "raw_share_name": "",
                    "raw_name": raw_name,
                    "fund_id": fund_id,
                    "fund_name": fund_legal_name,
                    "sec_id": "",
                    "sec_name": "",
                    "page_index": [page_index],
                }
                for datapoint_name in datapoint_name_list:
                    data[datapoint_name] = ""
                data[datapoint] = value
                total_data_list.append(data)
        # Persist the merged records as Excel and JSON, then return them.
        total_data_df = pd.DataFrame(total_data_list)
        total_data_df.fillna("", inplace=True)
        merged_data_excel_file = os.path.join(merged_data_excel_folder, f"merged_{self.doc_id}.xlsx")
        with pd.ExcelWriter(merged_data_excel_file) as writer:
            total_data_df.to_excel(writer, index=False, sheet_name="merged_data")
        merged_data_json_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
        with open(merged_data_json_file, "w", encoding="utf-8") as f:
            json.dump(total_data_list, f, ensure_ascii=False, indent=4)
        return total_data_list
    def get_raw_name_db_match_result(
        self, raw_name_list, investment_type: str, iter_count: int = 30
--- a/core/page_filter.py
+++ b/core/page_filter.py
@ -15,7 +15,6 @@ class FilterPages:
        self, 
        doc_id: str, 
        pdf_file: str, 
        document_mapping_info_df: pd.DataFrame, 
        doc_source: str = "emea_ar",
        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
    ) -> None:
@ -23,6 +22,7 @@ class FilterPages:
        self.pdf_file = pdf_file
        self.output_pdf_text_folder = output_pdf_text_folder
        self.configuration_folder = f"./configuration/{doc_source}/"
        self.doc_source = doc_source
        misc_config_file = os.path.join(self.configuration_folder, "misc_config.json")
        if os.path.exists(misc_config_file):
            with open(misc_config_file, "r", encoding="utf-8") as file:
@ -31,10 +31,7 @@ class FilterPages:
        else:
            self.apply_pdf2html = False
        self.page_text_dict = self.get_pdf_page_text_dict()
-        if document_mapping_info_df is None or len(document_mapping_info_df) == 0:
+
            self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)
        else:
            self.document_mapping_info_df = document_mapping_info_df
        self.get_configuration_from_file()
        self.doc_info = self.get_doc_info()
        self.datapoint_config, self.datapoint_exclude_config = (
@ -119,6 +116,7 @@ class FilterPages:
        domicile_datapoint_config_file = os.path.join(self.configuration_folder, "domicile_datapoints.json")
        datapoint_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_keyword.json")
        datapoint_exclude_keywords_config_file = os.path.join(self.configuration_folder, "datapoint_exclude_keyword.json")
        datapoint_type_config_file = os.path.join(self.configuration_folder, "datapoint_type.json")
        with open(language_config_file, "r", encoding="utf-8") as file:
            self.language_config = json.load(file)
@ -130,9 +128,13 @@ class FilterPages:
            datapoint_exclude_keywords_config_file, "r", encoding="utf-8"
        ) as file:
            self.datapoint_exclude_keywords_config = json.load(file)
        with open(
            datapoint_type_config_file, "r", encoding="utf-8"
        ) as file:
            self.datapoint_type_config = json.load(file)
    def get_doc_info(self) -> dict:
-        if len(self.document_mapping_info_df) == 0:
+        if self.doc_source == "emea_ar":
            return {
                "effective_date": None,
                "document_type": "ar",
@ -140,22 +142,16 @@ class FilterPages:
                "language": "english",
                "domicile": "LUX",
            }
-        effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
+        elif self.doc_source == "aus_prospectus":
-        document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
+            return {
-        if document_type in [4, 5]:
+                "effective_date": None,
-            document_type = "ar"
+                "document_type": "prospectus",
-        elif document_type == 1:
+                "language_id": "0L00000122",
-            document_type = "prospectus"
+                "language": "english",
-        language_id = self.document_mapping_info_df["Language"].iloc[0]
+                "domicile": "AUS",
-        language = self.language_config.get(language_id, None)
+            }
-        domicile = self.document_mapping_info_df["Domicile"].iloc[0]
+        else:
-        return {
+            raise ValueError(f"Invalid doc_source: {self.doc_source}")
            "effective_date": effective_date,
            "document_type": document_type,
            "language_id": language_id,
            "language": language,
            "domicile": domicile,
        }
    def get_datapoint_config(self) -> dict:
        domicile = self.doc_info.get("domicile", None)
@ -224,7 +220,8 @@ class FilterPages:
            if page_index < 2:
                continue
            page_num = page_index + 1
-            if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and page_num not in self.document_dp_pages:
+            if self.document_dp_pages is not None and len(self.document_dp_pages) > 0 and \
                page_num not in self.document_dp_pages:
                continue
            page_text = clean_text(page_text)
@ -237,7 +234,8 @@ class FilterPages:
            language = self.doc_info.get("language", None)
            if language is None:
                language = "english"
-            if language == "english" and re.search(self.percentage_regex, text) is None:
+            if self.doc_source == "emea_ar" and language == "english" and \
                re.search(self.percentage_regex, text) is None:
                continue
            for datapoint, keywords in self.datapoint_config.items():
                find_datapoint = False
@ -257,10 +255,12 @@ class FilterPages:
                                    break
                            if need_exclude:
                                continue
-
+                        is_valid = True
-                        is_valid = self.search_in_sentence_is_valid(search_text, text)
+                        data_type = self.datapoint_type_config.get(datapoint, "float")
-                        if not is_valid:
+                        if data_type == "float":
-                            continue
+                            is_valid = self.search_in_sentence_is_valid(search_text, text)
                            if not is_valid:
                                continue
                        result[datapoint].append(page_index)
                        detail = {
                            "doc_id": self.doc_id,
@ -292,28 +292,32 @@ class FilterPages:
                for split in search_text_split:
                    if split[0].islower():
                        lower_word_count += 1
-                if lower_word_count < lower_word_count_threshold:
+                if self.doc_source == "emea_ar" and \
-                    if re.search(self.percentage_regex, search_text) is not None:
+                    lower_word_count > lower_word_count_threshold:
-                        is_valid = True
+                        is_valid = False
                        break
                    new_search_text_regex = add_slash_to_text_as_regex(search_text)
                    new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
                        new_search_text_regex
                    )
                    new_search = re.search(new_search_regex, text, re.IGNORECASE)
                    if new_search is not None:
                        next_line = new_search.group("next_line").strip()
                        next_2_line = new_search.group("next_2_line").strip()
-                        if re.search(big_number_regex, next_line) is not None or \
+                if re.search(self.percentage_regex, search_text) is not None:
-                            re.search(big_number_regex, next_2_line) is not None:
+                    is_valid = True
-                            is_valid = False
+                    break
-                        else:
+                new_search_text_regex = add_slash_to_text_as_regex(search_text)
-                            is_valid = True
+                new_search_regex = r"\n.*{0}.*\n(?P<next_line>.*)\n(?P<next_2_line>.*)\n".format(
-                            break
+                    new_search_text_regex
                )
                new_search = re.search(new_search_regex, text, re.IGNORECASE)
                if new_search is not None:
                    next_line = new_search.group("next_line").strip()
                    next_2_line = new_search.group("next_2_line").strip()
                    if re.search(big_number_regex, next_line) is not None or \
                        re.search(big_number_regex, next_2_line) is not None:
                        is_valid = False
                    else:
                        is_valid = True
                        break
                else:
                    is_valid = True
                    break
        return is_valid
    def search_keyword(self, text: str, keyword: str):
--- a/instructions/aus_prospectus/compare_table_structure_prompts.json
+++ b/instructions/aus_prospectus/compare_table_structure_prompts.json
@ -0,0 +1,9 @@
 {
 	"prompts": [
        "Assume there is a data table in current page contents, is there the table with same table structure in the next page contents?", 
        "The meaning of \"same\" is: with totally same table columns for the table in both of current page and next page.",
        "Please output JSON format, the format example is:",
        "{\"answer\": \"Yes\"} or {\"answer\": \"No\"}",
        "Answer:\n"
    ]
 }
--- a/instructions/aus_prospectus/data_extraction_prompts_config.json
+++ b/instructions/aus_prospectus/data_extraction_prompts_config.json
@ -16,9 +16,9 @@
 	],
 	"data_business_features": {
 		"common": [
-			"General rules:",
+			"## General rules",
-			"- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.",
+			"1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.",
-			"- 2. Fund name: ",
+			"2. Fund name: ",
 			"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", 
 			"b. The sub-fund name may be as the first column or first row values in the table.",
 			"b.1 fund name example:",
@ -30,13 +30,10 @@
 			"c. If with multiple fund names in context, please retrieve the fund name closest above the numerical value.",
 			"c.1 fund name example:",
 			"---- Example Start ----",
-			"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nƒ$GGLWLRQDO8QDXGLWHG$SSHQGL[$118$/5(3257$;$:RUOG)XQGV\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26%  \n1.29%",
+			"AXA World Funds ACT Emerging Markets Bonds\nAXA World Funds \n \nAdditional Unaudited Appendix \n\nExpense Ratios (continued) \n \nCalculated TER (1) \nSwiss method \nApplied\nService Fee (2)\nOngoing \nCharges (3) \n \nwith performance \nfees \nwithout performance \nfees \n \nAXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon \nA Capitalisation CHF Hedged \n1.26% \n1.26% \n0.26%  \n1.29%",
 			"---- Example End ----",
 			"Correct fund name: AXA World Funds - ACT Emerging Markets Short Duration Bonds Low Carbon",
 			"\n",
 			"- 3. Only extract the latest data from context:",
 			"If with multiple data values in same row, please extract the latest.",
 			"\n",
 			"d. Some table format, the fund name is in the end of row, please extract the fund name from the end of row.",
 			"---Example Start---",
 			"\nTotal\nTransaction Costs\nPerformance Fees\nManagement fees and costs\nIndirect Fee\nManagement fees\nMLC diversified investment\noption\n1.49% p.a.\n0.01% p.a.\n0.06% p.a.\n0.07% p.a.\n1.35% p.a.\nMLC Horizon 2\nIncome Portfolio\n",
@ -51,25 +48,56 @@
 			"---Example End---",
 			"Correct fund name: MLC Horizon 2 Income Portfolio",
 			"Correct share name: MLC Horizon 2 Income Portfolio",
-			"- 4. Reported names:",
+			"f. In table header, \"Retirement account\" or \"Account-based pension\" means \"Pension\"; ", 
 			"\"Transition to Retirement account\" or \"Pre-retirement pension\" means \"TTR\". ",
 			"Please append them to the fund name and share name.",
 			"f.1 Example 1",
 			"---Example 1 Start---",
 			"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
 			"---Example 1 End---",
 			"The prefix is \"Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Retirement account Cash\".",
 			"f.2 Example 2",
 			"---Example 2 Start---",
 			"Transition to Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
 			"---Example 2 End---",
 			"The prefix is \"Transition to Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Transition to Retirement account Cash\".",
 			"f.3 Example 3",
 			"---Example 3 Start---",
 			"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Fund’s reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
 			"---Example 3 End---",
 			"Although exist \"Retirement account\" and \"Transition to Retirement account\", but the investment option is not exist, so fund name and share name should be: \"Rest Pension\".",
 			"\n",
 			"3. Only extract the latest data from context:",
 			"If with multiple data values in same row, please extract the latest.",
 			"\n",
 			"4. Reported names:",
 			"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
 			"Only output the values which with significant reported names.",
-			"- Multiple data columns with same reported name but different post-fix:",
+			"Multiple data columns with same reported name but different post-fix:",
 			"If there are multiple reported names with different post-fix text, here is the priority rule:",
 			"The pos-fix text is in the brackets: (gross), (net), pick up the values from (net).",
 			"---Example Start---",
 			"\n Investment option \nInvestment option \nmanagement \ncosts1  \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2  \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net)  \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross)  \n% p.a. \n(A + B)\nTotal Management \nfees and costs  \n(net)  \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
 			"---Example End---",
 			"The output should be:",
-			"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
+			"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
-			"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
+			"5. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
-			"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
+			"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\".",
 			"6. Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0",
 			"---Example Start---",
 			"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n",
 			"---Example End---",
 			"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
 			"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
 			"The output should be:",
 			"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
 			"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
 		],
 		"investment_level": {
 			"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
 			"management_fee_and_costs": "Management fee and costs is share level data.",
 			"management_fee": "Management fee is share level data.",
 			"performance_fee_costs": "Performance fee costs is share class level data.",
 			"performance_fee": "Performance fees is share class level data.",
 			"buy_spread": "Buy spread is share class level data.",
 			"sell_spread": "Sell spread is share class level data.",
 			"establishment_fee": "Establishment fee is share class level data.",
@ -87,7 +115,7 @@
 			"date_of_last_hwm_reset": "Date of last hwm reset is share class level data.",
 			"date_of_last_performance_fee_restructure": "Date of last performance fee restructure is share class level data.",
 			"high_water_mark_type": "High water mark type is share class level data.",
-			"minimum_initial_investment": "Minimum initial investment is share class level data.",
+			"minimum_initial_investment": "Minimum initial investment is fund level data.",
 			"recoverable_expenses": "Recoverable expenses is share class level data.",
 			"indirect_costs": "Indirect costs is share class level data."
 		},
@ -95,8 +123,7 @@
 			"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
 			"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
 			"management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
-			"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.",
+			"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
 			"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.",
 			"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
 			"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
 			"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
@ -115,42 +142,763 @@
 			"date_of_last_performance_fee_restructure": "Date of last performance fee restructure is belong to date, the value should be date format. e.g. 12 August 2022",
 			"high_water_mark_type": "High water mark type is belong to text, the value should be text.",
 			"minimum_initial_investment": "Minimum initial investment is belong to decimal number, the value could be more than 100, e.g. 625.00",
 			"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100.",
 			"recoverable_expenses": "Recoverable expenses is belong to percentage number, the value should be less than 100.",
-			"indirect_costs": "Indirect costs is belong to percentage number, the value should be less than 100."
+			"change_recoverable_expenses": "Change recoverable expenses is belong to percentage number, the value should be less than 100."
 		},
 		"special_rule": {
 			"management_fee_and_costs": [
-				"If there are multiple Management fee and costs reported names, here is the priority rule:",
+				"### Management fee and cost",
-				"A. With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
+				"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.",
-				"---Example Start---",
+				"A. If there are multiple Management fee and costs reported names, here are the priority rules:",
 				"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
 				"---Example 1 Start---",
 				"\n Investment option \nInvestment option \nmanagement \ncosts1  \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2  \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net)  \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross)  \n% p.a. \n(A + B)\nTotal Management \nfees and costs  \n(net)  \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
 				"\n",
 				"A.2 The data value with gross and net, please ignore gross value, output the net value only.",
 				"---Example Start---",
 				"Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n",
 				"---Example End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]",
+				"{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}",
 				"\n",
-				"If there are multiple Management fee and costs sub-columns, here is the rule:",
+				"If the context only mentions the gross value or gross asset value or GAV, please ignore the gross value as the management_fee_and_costs and management_fee value, just output empty.",
-				"B. With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
+				"---Example Start---",
 				"Fees and other costs (cont’d) \n\n08 \n\nType of fee or cost Amount 2 How and when paid \nMANAGEMENT COSTS \nThe fees and costs for \nmanaging your investment 1 \nEstimated at 0.75625% of gross \nasset value (GAV) per annum, \ncomprising: \nThe base management fee is payable from \nthe income and assets of the Fund to the \nInvestment Manager half-yearly in arrears \nBase Management Fee \n0.50% per annum of the Average \nGAV 3 \nAnd \nExpense Recovery Costs \n0.25625% (estimated) per annum \nof GAV in other fees, expenses \nand indirect costs.",
 				"---Example End---",
 				"The output should be:",
 				"{\"data\": []}",
 				"\n",
 				"B. The table title is with Ongoing annual fees and costs.",
 				"B.1 Management fees and costs should not include transaction costs and performance fees.",
 				"---Example Start",
 				"Ongoing annual\nfees and costs\nC Class and E Class -P Class - Performance \nStandard Fee Option Fee Option \n36 \n(E Class is closed to \nnew investors) \nPlatinum International Fund 1.56% p.a. 1.46% p.a.  \nOngoing annual fees and costs include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees (for P Class – Performance Fee Option \nonly). Please see page 36 for further information.",
 				"---Example End",
 				"The values 1.56 and 1.46 include estimated management fees and costs, estimated \ntransaction costs and estimated performance fees, should ignore them.",
 				"The output should be:",
 				"{\"data\": []}",
 				"B.2 If with pure management fees and costs in table, please extract relevant values",
 				"---Example Start---",
 				"Fees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nManagement fees and costs \nEstimated management fees and costs \nper annum are: \nPlatinum International Fund 1.41% 1.16%\nPlatinum Global Fund (Long Only) 1.35% 1.10%\n",
 				"---Example End---",
 				"a. For this example, there is pure \"Management fees and costs\", please extract relevant values.",
 				"b. This example mentioned share classes, please output according to share class.",
 				"The output should be",
 				"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.41, \"management_fee\": 1.41}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.16, \"management_fee\": 1.16}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.35}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"management_fee_and_costs\": 1.1, \"management_fee\": 1.1}]}",
 				"\n",
 				"C. If there are multiple Management fee and costs sub-columns, here is the rule: ", 
 				"C.1 With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
 				"---Example Start---",
 				"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
 				"---Example End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]",
+				"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee_costs\": 0.06}]}",
 				"\n",
-				"C. If only find \"Management fees and costs\", please output the relevant as data point key: \"management_fee_and_costs\", instead of \"management_fee\".",
+				"C.2 With \"Total management cost (% pa)\" = \"Management fee (% pa)\" + \"Estimated other indirect costs\" + \"Estimated expense recoveries\" + \"Estimated Regulatory Change Expense Recovery\".",
 				"The management_fee is the value of \"Management fee (% pa)\".",
 				"The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
 				"---Example 1 Start---",
 				"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.00 0.04 0.00 0.01 1.38 0.31\n1.29 0.00 0.00 0.00 0.01 1.30 0.29\n",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
 				"---Example 2 Start---",
 				"\nFund name \nManagement fee \nRecoverable expenses \nEstimated performance-related fee \nEstimated other indirect costs \nPathways 30 \n1.16% pa \n0.02% pa \n0.05% pa \n0.05% pa \nPathways 70 \n1.30% pa \n0.01% pa \n0.06% pa \n0.04% pa \n",
 				"---Example 2 End---",
 				"The management_fee_and_costs is the value of \"Management fee\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
 				"The management_fee is the value of \"Management fee\".",
 				"The performance_fee_costs is the value of \"Estimated performance-related fee\".",
 				"The indirect_costs is the value of \"Estimated other indirect costs\".",
 				"The recoverable_expenses is the value of \"Recoverable expenses\".",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Pathways 30\", \"share name\": \"Pathways 30\", \"management_fee_and_costs\": 1.23, \"management_fee\": 1.16, \"recoverable_expenses\": 0.02, \"performance_fee_costs\": 0.05, \"indirect_costs\": 0.05}, {\"fund name\": \"Pathways 70\", \"share name\": \"Pathways 70\", \"management_fee_and_costs\": 1.35, \"management_fee\": 1.3, \"recoverable_expenses\": 0.01, \"performance_fee_costs\": 0.06, \"indirect_costs\": 0.04}]}",
 				"\n",
 				"D. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
 				"---Example 1 Start---",
 				"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
 				"---Example 1 End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}]",
+				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]}",
 				"---Example 2 Start---",
 				"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
 				"---Example 2 End---",
 				"The output should be:",
-				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21}]"
+				"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
 				"---Example 3 Start---",
 				"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n",
 				"---Example 3 End---",
 				"For value: 1.04%(g)/2.18%(n), (g) means gross, (n) means net, please extract net value: 2.18",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}",
 				"\n",
 				"E. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
 				"---Example 1 Start---",
 				"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
 				"\n",
 				"F. If columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\" appear, please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\" for EACH SPECIFIC investment option. ", 
 				"DO NOT assume these values apply to other investment options mentioned elsewhere in the context or from provided examples.",
 				"---Example 1 Start---",
 				"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
 				"---Example 1 End---",
 				"For this example, please ignore the \"Total investment fees and costs\" and \"Transaction costs\" columns, ", 
 				"just output the values from \"Investment fees and costs (excl Performance Fees)\" as management_fee and management_fee_and_costs, ", 
 				"output the values from \"Performance Fee\" as performance_fee_costs.",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee_costs\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee_costs\": 0.18}]}",
 				"\n",
 				"G. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00% or 0.55%-1.00%, please ignore and output empty.",
 				"---Example 1 Start---",
 				"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
 				"---Example 1 End---",
 				"The relevant values: 0.26 and 1.82, are in the range, so the output should be:",
 				"{\"data\": []}",
 				"---Example 2 Start---",
 				"Investment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \n",
 				"---Example 2 End---",
 				"The relevant values: 0.07 and 1.00, are in the range, should ignore, so the output should be:",
 				"{\"data\": []}",
 				"---Example 3 Start---",
 				"Management fees and costs \n0.67–1.17% p.a. (estimated) \nThe fees and costs for \nmanaging your investment \n",
 				"---Example 3 End---",
 				"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
 				"{\"data\": []}",
 				"---Example 4 Start---",
 				"Type of fee or cost Amount 2 How and when paid \nOngoing annual fees and costs 3 \nManagement fees and costs \n0.82% to 1.22% p.a. (estimated) \n",
 				"---Example 4 End---",
 				"The relevant values: 0.82 and 1.22, are in the range, should ignore, so the output should be:",
 				"{\"data\": []}",
 				"\n",
 				"H. If the management fee and costs value includes the performance fee, please exclude or subtract the performance fee value and output only the management fee and costs value.",
 				"---Example 1 Start---",
 				"MANAGEMENT COSTS AND TRANSACTION COSTS \n\nOption name Management costs \nEstimated \nperformance \nfee (pa) 1 \nTotal management\ncosts (including\nestimated performance\nfee) pa\nTransaction costs \nper transaction (%) \nMULTI-MANAGER MULTI-SECTOR (These investment options are located in the ‘Investment Options Menu’ on pages 18 to 19.) \nFirstChoice Wholesale Defensive 0.85% 0.85% 0.15\nFirstChoice Wholesale Conservative 0.90% 0.02%1 0.92% 1 0.15 \n",
 				"---Example 1 End---",
 				"The column: \"Total management costs (including estimated performance fee) pa\" is the sum of \"Management costs\" and \"Estimated performance fee (pa)\", we should ignore the \"Estimated performance fee (pa)\" value, just output the \"Management costs\" value.",
 				"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
 				"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
 				"---Example 2 Start---",
 				"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCapital Stable 0.46% 0.04% 0.08% 0.54%\nBalanced 0.52% 0.06% 0.10%0.62% \n",
 				"---Example 2 End---",
 				"The column: \"(A) Investment fees and costs (including (B) performance fees) (pa)*\" includes the \"(B) Performance fees (pa)\" value, so we should subtract the \"(B) Performance fees (pa)\" value and output only the pure management fee and costs value.",
 				"Besides, \"Retirement account\" is the prefix of the fund name and should be output together with the fund/ share name, e.g. \"Retirement account Capital Stable\"",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Retirement account Capital Stable\", \"share name\": \"Retirement account Capital Stable\", \"management_fee_and_costs\": 0.42, \"management_fee\": 0.42, \"performance_fee_costs\": 0.04}, {\"fund name\": \"Retirement account Balanced\", \"share name\": \"Retirement account Balanced\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.46, \"performance_fee_costs\": 0.06}]}",
 				"---Example 3 Start---",
 				"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
 				"---Example 3 End---",
 				"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
 				"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
 				"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
 				"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
 				"So the output should be:",
 				"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
 				"\n",
 				"I. If exist **\"Maximum management fee\"** in context, please ignore relevant values.",
 				"---Example Start---",
 				"Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%",
 				"---Example End---",
 				"The values in the example are **Maximum management fee** values, so all of them should be ignored.",
 				"The Output should be:",
 				"{\"data\": []}",
 				"J. The management fee and costs in paragraph with specific fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"",
 				"---Example 1 Start---",
 				"Account-based pension \nInvestment fees \nand costs 2 \nHigh Growth 0.45%, Growth 0.49%",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Account-based pension High Growth\", \"share name\": \"Account-based pension High Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45}, {\"fund name\": \"Account-based pension Growth\", \"share name\": \"Account-based pension Growth\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}",
 				"---Example 2 Start---",
 				"Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. \nHigh Growth 0.48%, Growth 0.50%",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}",
 				"K. DO NOT extract management fees from \"Cost of product\" summaries. ", 
 				"\"Cost of product\" figures should not be treated as 'Investment fees and costs'.",
 				"---Example Start---",
 				"Investment option Cost of product \nCash $141.00",
 				"---Example End---",
 				"FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!",
 				"The output should be:",
 				"{\"data\": []}",
 				"L. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option.",
 				"M. Identify the value of management fee and costs, and if it is written 0% or 0.00% or 0 or 0.00, then extract the same as 0, please don't ignore it."
 			],
 			"administration_fees":[
 				"### Administration fees and costs",
 				"Administration fees and costs and total annual dollar-based charges are share class level data.",
 				"Simple case:",
 				"----Example 1 Start----",
 				"Fees and costs summary \n\nVision income streams \n\nType of fee Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n2 \n0.25% pa of your account balance (made up of \n0.25% of your account balance which is capped \nat $1,050 pa plus a reserving margin of 0.00% \npa of each investment option’s assets).",
 				"----Example 1 End----",
 				"According to example, the administration fee is 0.25% pa, so administration_fees is 0.25, ",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Vision income streams\", \"share name\": \"Vision income streams\", \"administration_fees\": 0.25}]}",
 				"\n",
 				"----Example 2 Start----",
 				"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
 				"----Example 2 End----",
 				"According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ",
 				"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}",
 				"\n",
 				"----Example 3 Start----",
 				"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply)",
 				"----Example 3 End----",
 				"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
 				"total_annual_dollar_based_charges is 1 * 52 = 52",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
 				"\n",
 				"---Example 4 Start---",
 				"\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n",
 				"---Example 4 End---",
 				"According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ",
 				"total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}",
 				"\n",
 				"---Example 5 Start---",
 				"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouse’s \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the ‘Fees and \nother costs’ section on \npages 40-46 for details \n",
 				"---Example 5 End---",
 				"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
 				"total_annual_dollar_based_charges is 1 * 52 = 52",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
 				"---Example 6 Start---",
 				"Administration \nfees and costs \n1 \nFirstChoice Lifestage (MySuper product) \nand Select investment options \n(other than FirstRate Saver) \n0.04% p.a. \nThe percentage‑based administration fee is reflected in \nthe daily unit price of your investment option and payable \nmonthly or as incurred by the option. \nFirstRate Saver \nFrom 0.35% to \n0.50% p.a. \nThe dollar‑based administration fee of $5 per month is \npayable at the beginning of each month by deduction of \nunits from one of your options. \nDollar-based fee discounts \nThe current fee for FirstRate Saver is set out at \ncfs.com.au/personal/resources/funds-and-performance/ \nfirstrate‑interest‑rates.html \nYour employer may be able to negotiate a lower dollar‑ \nbased administration fee for employee members. \nplus \nDollar-based administration fee \nRetained benefit and spouse members are not entitled \nto this discount. \n$60 p.a. ($5 per month) per account \n",
 				"---Example 6 End---",
 				"According to example, the administration fee is 0.04, ",
 				"\"From 0.35% to 0.50% p.a.\" is a range value, so it needs to be ignored and excluded; therefore administration_fees is 0.04, ",
 				"the total_annual_dollar_based_charges is 60 (5 per month * 12)",
 				"About fund name, it should be \"FirstChoice Lifestage\".",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"FirstChoice Lifestage\", \"share name\": \"FirstChoice Lifestage\", \"administration_fees\": 0.04, \"total_annual_dollar_based_charges\": 60}]}",
 				"\n",
 				"Complex cases:",
 				"A. Need to add multiple numbers together.",
 				"---Example 1 Start---",
 				"MLC MasterKey Super & Pension Fundamentals \n\nType of fee or cost \nOngoing annual fees and costs 1 \n\nAdministration fees and \ncosts \n\nAccount balance \n\nFirst $150,000 \n\nRemaining balance \nover $150,000 \n\nThe percentage Administration fee \ncharged to each account you have \n(excluding the fixed fee and Trustee \nLevy) is capped at $2,500 pa. \n\nPlus \n\nTrustee Levy of 0.02% pa of your \naccount balance. \n\nPlus \n\nAmount \n\nHow and when paid \n\nPercentage fee \n(% pa) \n\n0.30 \n\n0.10 \n\nAdministration fee \n\nThe Administration fee is deducted monthly from your account and will \nbe rounded off to 2 decimal points. As a result of the rounding, the total \nannual amount may slightly differ. \n\nThe percentage fee for each month is calculated using your average Super \nand Pension account balance for the previous month. \n\nThe Trustee Levy will be deducted monthly from your account balance. \n\nThe levy amount for each month is calculated using your account balance \nat the date it's deducted. \n\nYou won't see these costs as direct charges to your account. They reduce \nthe balance held in reserves used to cover certain costs related to the \nrunning of the MLC Super Fund. \n\n4 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
 				"---Example 1 End---",
 				"According to example, the relevant values: first: 0.30%, remaining balance over: 0.10%, Plus Trustee Levy: 0.02%.",
 				"Please ignore the remaining balance over 0.10%, add first: 0.30% and Plus Trustee Levy: 0.02% = 0.32%", 
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}",
 				"---Example 2 Start---",
 				"Mine Super\nType of fee or cost Amount (% pa) How and when paid \nOngoing annual fees and costs \n1 \nWe generally calculate and \ndeduct this fee daily when unit \nprices are determined. \nAdministration fees \nand costs \n0.16% pa \nPlus \n0.031% pa. \n",
 				"---Example 2 End---",
 				"According to example, the relevant values: 0.16% and 0.031%, so administration_fees is 0.16 + 0.031 = 0.191",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Mine Super\", \"share name\": \"Mine Super\", \"administration_fees\": 0.191}]}",
 				"---Example 3 Start---",
 				"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Fund’s reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
 				"---Example 3 End---",
 				"According to the example, the administration fee is $1.50 per week plus 0.10% pa, Administration costs is 0.09% pa so administration_fees is 0.1 + 0.09 = 0.19, ",
 				"total_annual_dollar_based_charges is 1.50 * 52 = 78",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Rest Pension\", \"share name\": \"Rest Pension\", \"administration_fees\": 0.19, \"total_annual_dollar_based_charges\": 78}]}",
 				"---Example 4 Start---",
 				"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n",
 				"---Example 4 End---",
 				"Attention: the \"plus trustee fee of 0.0165% p.a. of your account balance\" is only part of administration_fees and the \"first\" part is missing, so please do not output 0.0165 as administration_fees; only output total_annual_dollar_based_charges as 78.",
 				"B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).",
 				"---Example Start---",
 				"My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of option’s assets* \nFund1 0.12%\n",
 				"---Example End---",
 				"According to example, \"My Super\" is with \"Administration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a maximum of $1,000 p.a.) \n$0.50 per week deducted from your account balance at the end of each month or on exit.\"", 
 				"so administration_fees is 0.17, total_annual_dollar_based_charges is 0.50 * 52 = 26, with production name: \"My Super\".",
 				"\"Fund1\" is with specific fund/ share name, so management_fee_and_costs and management_fee are: 0.12",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"My Super\", \"share name\": \"My Super\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 26}, {\"fund name\": \"Fund1\", \"share name\": \"Fund1\", \"management_fee_and_costs\": 0.12, \"management_fee\": 0.12}]}"
 			],
 			"total_annual_dollar_based_charges": [
 				"### Total annual dollar-based charges",
 				"Total annual dollar-based charges are share class level data.",
 				"A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.",
 				"----Example Start----",
 				"MLC MasterKey Super & Pension Fundamentals\nType of fee or cost \nOngoing annual fees and costs 1 \nAmount \nHow and when paid \nOther administration costs paid from \nreserves of 0.00% pa of your account \nbalance. \nPlus \nA fixed fee of $1.50 per week \nThis fee is deducted monthly if your account balance is below $50,000 \nwhen the percentage administration fee is deducted. \nInvestment fees and \ncosts 2 \nInvestment fees and estimated costs \nfor MLC Horizon 4 Balanced Portfolio, \n1.20% pa. \nYou won ’ t see these fees and costs as direct charges to your account. \nThey're reflected in the daily unit price of each investment option and will \nreduce the net return on your investment \nInvestment fees and estimated costs \nfor other investment options, ranges \nfrom 0.00% pa to 2.84% pa \n(estimated). \nTransaction costs \nMLC Horizon 4 Balanced Portfolio, \n0.06% pa (estimated). \nOther investment options, ranges \nfrom 0.00% pa to 0.24% pa \n(estimated). \nYou won ’ t see these costs as direct charges to your account. They're \nreflected in the daily unit price of each investment option and will reduce \nthe net return on your investment. \nMember activity related fees and costs \nBuy-sell spread \nYou won ’ t see this fee as a direct charge to your account. It ’ s reflected in \nthe buy and sell unit price of each investment option when there ’ s a \ntransaction on your account. \nMLC Horizon 4 Balanced Portfolio, \n0.10%/0.10% \nOther investment options, ranges \nfrom 0.00%/0.00% to 0.30%/0.30% \nThe current buy-sell spreads of an investment option are available at \nmlc.com.au/buysellspreads \n",
 				"----Example End----",
 				"According to example, the fixed fee is $1.50 per week, so total_annual_dollar_based_charges is 1.50 * 52 = 78",
 				"In the example, also with management fees and costs, management fee, buy_spread and sell_spread for specific fund: MLC Horizon 4 Balanced Portfolio.",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
 				"\n",
 				"B. Please identify the cases which do not belong to total_annual_dollar_based_charges, and output empty for them.",
 				"----Example 1 Start----",
 				"Cost of product information \n\nCost of product for 1 year \n\nThe cost of product gives a summary calculation about \nhow ongoing annual fees and costs can affect your \nsuperannuation investment over a 1-year period for all \ninvestment options. It is calculated in the manner \nshown in the 'Example of annual fees and costs'. \n\nThe cost of product information assumes a balance of \n$50,000 at the beginning of the year. (Additional fees \nsuch as a buy/sell spread may apply – refer to the ‘Fees \nand costs summary’ table for the relevant investment \noption.) \n\nYou should use this figure to help compare \nsuperannuation products and investment options. \n\nInvestment option \nCash \nCost of product \nPerpetual Cash \n$60.00 \nFixed income and credit \nBentham Global Income \n$485.00 \n",
 				"----Example 1 End----",
 				"Explanation:",
 				"The values provided in the example are not total annual dollar-based charges; ", 
 				"they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ", 
 				"This figure includes ongoing annual fees and costs, but it may not encompass all possible charges, such as additional fees like buy/sell spreads. ", 
 				"Therefore, it serves as a comparative tool rather than a comprehensive total of all annual charges.",
 				"The output should be empty:",
 				"{\"data\": []}",
 				"----Example 2 Start----",
 				"Equals \nCost of product \n1 \nIf your balance was $50,000 at \nthe beginning of the year, then \nfor that year you will be charged \nfees and costs of $395 for the \nsuperannuation product. \n\n",
 				"----Example 2 End----",
 				"Explanation:",
 				"The values provided in the example are not total annual dollar-based charges; ",
 				"they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ",
 				"FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!!"
 			],
 			"buy_spread": [
 				"### Buy/sell spread",
 				"Buy/sell spread is share class level data.",
 				"A. Exclude reported name", 
 				"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
-				"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)"
+				"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ",
 				"Estimated transaction costs offset by buy/sell spreads (% pa), Transaction costs",
 				"---Example 1 Start---",
 				"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
 				"---Example 1 End---",
 				"The data should be excluded, the output should be:",
 				"{\"data\": []}",
 				"\n",
 				"---Example 2 Start---",
 				"Transaction costs \nRetirement and TTR income streams \n0.06% p.a. for Defensive Growth, 0.04% p.a. for International \nShares, 0.08% p.a. for Australian Shares, 0.19% p.a. for Property",
 				"---Example 2 End---",
 				"The data is about Transaction costs, should be excluded, the output for buy_spread and sell_spread should be:",
 				"{\"data\": []}",
 				"\n",
 				"---Example 3 Start---",
 				"Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n",
 				"---Example 3 End---",
 				"The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:",
 				"{\"data\": []}",
 				"\n",
 				"---Example 4 Start---",
 				"Transaction costs \nOption % of option’s assets* \nHigh Growth 0.03% \nTaken into account in the daily calculation\nof unit prices\nMember activity related fees and costs \nBuy-sell spread Nil N/A\nSwitching fee Nil N/A\n",
 				"---Example 4 End---",
 				"According to example, please exclude Transaction costs.", 
 				"\"Buy-sell spread\" data section is under \"Member activity related fees and costs\", the value is Nil, output for buy_spread and sell_spread should be:",
 				"{\"data\": []}",
 				"B. Simple case with simple table structure:",
 				"---Example 1 Start---",
 				"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
 				"\n",
 				"---Example 2 Start---",
 				"Fund name \nManagement fees \nand costs (% p.a.) \n1 \nTransaction costs \n(% p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nEveryday Investing Balanced Fund 0.35 0.05 0.00\n",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Everyday Investing Balanced Fund\", \"share name\": \"Everyday Investing Balanced Fund\", \"management_fee_and_costs\": 0.35, \"management_fee\": 0.35, \"buy_spread\": 0, \"sell_spread\": 0}]}",
 				"\n",
 				"---Example 3 Start---",
 				"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nAUSTRALIAN SHARE \nFirst Sentier Australian Share Fund 0.96% 0.10\nFirst Sentier Concentrated Australian \nShare Fund 0.96% 0.10\nFirst Sentier Imputation Fund 0.97% 0.15\nAUSTRALIAN SHARE – SMALL COMPANIES \nFirst Sentier Australian Small \nCompanies Fund 1.12% 0.15\nGLOBAL SHARE \nStewart Investors Worldwide Leaders \nSustainability Fund 1.17% 0.10\nStewart Investors Worldwide Sustainability \nFund 1.02% 0.10\n",
 				"---Example 3 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"First Sentier Australian Share Fund\", \"share name\": \"First Sentier Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Concentrated Australian Share Fund\", \"share name\": \"First Sentier Concentrated Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Imputation Fund\", \"share name\": \"First Sentier Imputation Fund\", \"management_fee_and_costs\": 0.97, \"management_fee\": 0.97, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"First Sentier Australian Small Companies Fund\", \"share name\": \"First Sentier Australian Small Companies Fund\", \"management_fee_and_costs\": 1.12, \"management_fee\": 1.12, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"Stewart Investors Worldwide Leaders Sustainability Fund\", \"share name\": \"Stewart Investors Worldwide Leaders Sustainability Fund\", \"management_fee_and_costs\": 1.17, \"management_fee\": 1.17, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Stewart Investors Worldwide Sustainability Fund\", \"share name\": \"Stewart Investors Worldwide Sustainability Fund\", \"management_fee_and_costs\": 1.02, \"management_fee\": 1.02, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
 				"\n",
 				"---Example 4 Start---",
 				"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
 				"---Example 4 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
 				"\n",
 				"---Example 5 Start---",
 				"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund – Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund – Class A \n0.96% 0.05%\n\n",
 				"---Example 5 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund – Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund – Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
 			],
 			"performance_fee_costs": [
 				"### Performance fees",
 				"Performance fees is share class level data.",
 				"A. If the performance fees is with the range, please ignore and output empty.",
 				"---Example Start---",
 				"Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. \nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.",
 				"---Example End---",
 				"The relevant values: 0.00 and 2.18, are in the range, so the output should be:",
 				"{\"data\": []}",
 				"B. If the table is only about Cost of product, it should be excluded.",
 				"---Example Start---",
 				"Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n",
 				"---Example End---",
 				"The data is about Cost of product, should be excluded, the output for Performance fees should be:",
 				"{\"data\": []}",
 				"C. If with pure performance fee in table, please extract relevant values",
 				"---Example Start---",
 				"\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n",
 				"---Example End---",
 				"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
 				"b. This example mentioned share classes, please output according to share class.",
 				"The output should be",
 				"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}",
 				"D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0",
 				"---Example Start---",
 				"Fund/Investment Option \nManagement Fees \nand Costs \n(% pa) \n1 \nPerformance Fees 2 \n(% pa) \nTransaction Costs 3 \n(% pa) \nBT American Share Fund 1.08 0.00 0.00\nBT Asian Share Fund 1.10 0.00 0.10",
 				"---Example End---",
 				"a. For this example, as Performance fee mentioned as 0.00 so return 0 as performance fee datapoint value.",
 				"E. If for performance fee specifically Nil is written in the value then return NULL('') for the same",
 				"---Example Start---",
 				"Vanguard Investor Short Term Fixed Interest Fund PLUS Performance fees Nil \nAnd, you will be charged or have deducted \nfrom your investment $0 in performance fees \neach year.",
 				"---Example End---",
 				"a. For this example, as Performance fee mentioned as Nil so return NULL('') as performance fee datapoint value.",
 				"F. If you found Example in the header of the table then ignore that table and do not extract value from the same table",
 				"---Example Start---",
 				"Example - Vanguard Investor Short Term Fixed Interest Fund \nContribution fees Nil  \nFor every additional $5,000 you put in, you \nwill be charged $0.  \nPLUS Management fees and  \ncosts 3,4 \n0.19% p.a. of the NAV of the Fund \nAnd, for every $500,000 you have in the \nFund, you will be charged or have deducted \nfrom your investment $950 each year. \nPLUS Performance fees Nil \nAnd, you will be charged or have deducted \nfrom your investment $0 in performance fees \neach year.",
 				"---Example End---",
 				"a. For this example, you have Example keyword in the header so you should not extract any datapoint values Like performance_fee_costs, management fee etc."
 			],
 			"minimum_initial_investment": [
 				"### Minimum initial investment",
 				"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
 				"---Example 1 Start---",
 				"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"minimum_initial_investment\": 5000}]}",
 				"\n",
 				"---Example 2 Start---",
 				"Current minimum amounts \nSummary information \nFurther information \nInvestment amount \n$20,000 per Pension Plan account \nOperating your account \nInvestment amount/balance per \ninvestment option \n$5,000 \nOperating your account \n",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 5000}]}",
 				"\n",
 				"---Example 3 Start---",
 				"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
 				"---Example 3 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Prime Super\", \"minimum_initial_investment\": 10000}]}",
 				"\n",
 				"---Example 4 Start---",
 				"Minimum \nPlatform operators \nIndirect investors \ninvestment \namounts and their platform operators \nInitial – $500,000 \nAdditional – $5,000 \nMinimum investment amounts are subject to the arrangements between indirect investors \n",
 				"---Example 4 End---",
 				"The minimum initial investment is under the \"Initial\", the value is $500,000.",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 500000}]}",
 				"\n",
 				"---Example 5 Start---",
 				"Lifeplan Investment Bond Product Disclosure Statement \n\nThe Lifeplan Bond at a glance \n\nAt a glance Description Refer to page(s) \nContributions and access \nto your investment \n• \n• \nWe provide choice and flexibility for your investment with access to your money at anytime. \nStart your investment with as little as $1,000. \n• \nEstablish a regular savings plan. \n28 \n• \nYou can switch between the investment options and also rebalance within your selected \noptions at any time. \n• \nMinimum withdrawal – $500. \n",
 				"---Example 5 End---",
 				"If can't find the specific fund name, please apply production name, e.g. Lifeplan Investment Bond", 
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Lifeplan Investment Bond\", \"minimum_initial_investment\": 1000}]}"
 			],
 			"benchmark_name": [
 				"### Benchmark name",
 				"Benchmark is fund leval data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ",
 				"Sometime, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.",
 				"A. Examples for single benchmark name",
 				"---Example 1 Start---",
 				"MLC Property Securities Fund \nInvestment objective \nAims to outperform the Benchmark (after fees and before tax) over 5 year periods. \nBenchmark \nS&P/ASX 300 A-REIT Total Return Index \n",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"MLC Property Securities Fund\", \"benchmark_name\": \"S&P/ASX 300 A-REIT Total Return Index\"}]}",
 				"---Example 2 Start---",
 				"First Sentier Wholesale Global Listed Infrastructure Fund \n\nGLOBAL PROPERTY AND INFRASTRUCTURE SECURITIES \n\nObjective \n\nTo deliver capital growth and inflation \nprotected income by investing in \na globally diversified portfolio of \ninfrastructure securities. The fund aims \nto outperform the FTSE Global Core \nInfrastructure 50-50 (Net TR) Index \nhedged to Australian dollars over rolling \nthree-year periods before fees and taxes.",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"First Sentier Wholesale Global Listed Infrastructure Fund\", \"benchmark_name\": \"FTSE Global Core Infrastructure 50-50 (Net TR) Index\"}]}",
 				"---Example 3 Start---",
 				"MLC Horizon 5 Growth Portfolio \nInvestment objective\nAims to grow by more than inflation +3.5% pa (after fees and tax) over 10 years. \nBenchmark\nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \nHow the investment option is\nmanaged\nA diversified portfolio that ’ s predominantly weighted towards the more traditionally growth-focused \nassets that tend to provide higher levels of long-term capital growth (eg shares), with a small exposure to \nthe more stable, defensive asset classes of cash and fixed income. \n",
 				"---Example 3 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"MLC Horizon 5 Growth Portfolio\", \"benchmark_name\": \"Consumer Price Index\"}]}",
 				"---Example 4 Start---",
 				"Benchmark returns over 25 years by Traditional asset class \n\nPast performance is not a reliable indicator of future performance.\n\nMarket indices: Australian shares – S&P/ASX300 Accumulation Index, International shares – MSCI World Ex-Australia \nIndex (Unhedged)",
 				"---Example 4 End---",
 				"For this case, after keywords: \"Market indices\", there are multiple benchmark names with format: Fund name – Benchmark name,",
 				"please extract them all from the context contents.",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Australian shares\", \"benchmark_name\": \"S&P/ASX300 Accumulation Index\"}, {\"fund name\": \"International shares\", \"benchmark_name\": \"MSCI World Ex-Australia Index (Unhedged)\"}]}",
 				"---Example 5 Start---",
 				"Dimensional Global Small Company Trust – Active ETF\nFund name \nregistered with ASIC \nDimensional Global Small Company Trust \n\nInvestment objective \nThe investment objective of the Fund is to provide long term capital growth by gaining\nexposure to a diversified portfolio of small companies associated with approved developed \nmarkets (excluding Australia). \nThe Fund is not managed with the objective of achieving a particular return relative to a \nbenchmark index. However, to compare the performance of the Fund with a broad measure \nof market performance, reference may be made to the MSCI World ex Australia Small Cap\nIndex (net div.). \nThe index is referred to for comparison purposes only. The index is not intended to \nrepresent the current or targeted asset allocation of the Fund. The performance of the Fund \nmay differ significantly from the index. \n",
 				"---Example 5 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Dimensional Global Small Company Trust – Active ETF\", \"benchmark_name\": \"MSCI World ex Australia Small Cap Index (net div.)\"}]}",
 				"---Example 6 Start---",
 				"MLC Inflation \nPlus portfolios \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nMLC Inflation Plus - Conservative Portfolio \n\nAims to deliver a return of: \n\nSuper 1.7% pa above inflation (after fees and tax), \nPension (Pre-retirement phase) 1.7% pa above inflation (after fees and tax), or \nPension (Retirement phase) 2% pa above inflation (after fees and tax), \nsubject to limiting the risk of negative returns over 3 year periods. \n\nThis careful risk management approach means there may be times, such as when interest rates are \nunusually low, when the portfolio requires an extended time period to achieve its return objective. \n\nIn most circumstances the portfolio is expected to provide positive returns over 3 year periods, \nalthough there will sometimes be negative returns over shorter periods. \n\nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \n\n",
 				"----Example 6 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"MLC Inflation Plus - Conservative Portfolio\", \"benchmark_name\": \"Consumer Price Index\"}]}",
 				"---Example 7 Start---",
 				"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nPerpetual SHARE-PLUS Long-Short \n13.98% \n(maximum 15%) \nBenchmark S&P/ASX 300 Accumulation Index plus 2% pa \nHalf-yearly \n",
 				"---Example 7 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Perpetual SHARE-PLUS Long-Short\", \"benchmark_name\": \"S&P/ASX 300 Accumulation Index plus 2% pa\"}]}",
 				"---Example 8 Start---",
 				"Australian Shares \n\nBT Australian Share Fund BT Imputation Fund \n\nBenchmark S&P/ASX 300 (TR) Index S&P/ASX 300 (TR) Index \n\nBT Geared Imputation Fund BT Smaller Companies Fund \n\nBenchmark S&P/ASX 300 (TR) Index \n\nS&P/ASX Small Ordinaries \nAccumulation Index \n\n",
 				"---Example 8 End---",
 				"Description:",
 				"This is a complex example for multiple fund names which each fund name with single benchmark name.",
 				"The fund name line is with 2 funds.",
 				"e.g. \"\n\nBT Australian Share Fund BT Imputation Fund \n\n\", with 2 fund names: \"BT Australian Share Fund\" and \"BT Imputation Fund\".",
 				"The benchmark name line is with 2 benchmark names.",
 				"e.g. \"\n\nBenchmark S&P/ASX 300 (TR) Index S&P/ASX 300 (TR) Index \n\n\", with 2 benchmark names: \"S&P/ASX 300 (TR) Index\" and \"S&P/ASX 300 (TR) Index\".",
 				"Therefore, the output should be:",
 				"{\"data\": [{\"fund name\": \"BT Australian Share Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Imputation Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Geared Imputation Fund\", \"benchmark_name\": \"S&P/ASX 300 (TR) Index\"}, {\"fund name\": \"BT Smaller Companies Fund\", \"benchmark_name\": \"S&P/ASX Small Ordinaries Accumulation Index\"}]}",
 				"\n",
 				"B. Example for multiple benchmark names",
 				"---Example 1 Start---",
 				"Investment options other \nthan MLC portfolios \n\nFixed income \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nMacquarie Income Opportunities Fund \n\nThe fund aims to outperform the Benchmark over the medium term (before fees). It aims to provide \nhigher income returns than traditional cash investments at all stages of interest rate and economic \ncycles. \n\nBloomberg AusBond Bank Bill Index \n\nThe fund predominantly provides exposure to a wide range of domestic and global investment grade \nfloating and fixed rate instruments, asset-backed securities, and cash. The fund may also have \nopportunistic exposure to other fixed income sectors and instruments such as, high yield and emerging \n\nmarkets debt as well as other fixed income instruments. Interest rate risk will generally be hedged \nthrough the use of derivatives such as swaps and futures. \n\nThe investment process aims to reduce the risk of the fund being adversely affected by unexpected \nevents or downgrades in the credit rating of the fund ’ s investments. A disciplined framework is used \nto analyse each sector and proposed investment to assess its risk. \n\nThe fund may be exposed to derivatives to implement its investment strategy. For example, protection \nmay be purchased on issuers that are believed to be over-valued or at risk of downgrade. These \npositions increase in value when the underlying instrument falls in value and decrease in value when \nthe underlying instrument rises in value. \n\nThe portfolio is generally hedged to Australian dollars. 
However, any exposure to emerging markets \ndebt issued in the local currency of the debt will generally be unhedged. Small active currency positions \nmay also be taken when the investment manager believes that there are opportunities to add value \nor hedge risks in the portfolio. \n\nyou want a medium term investment horizon, seeking a steady and reliable income stream. \n\n3 years \n\nAsset class \n\nInvestment grade credit* \n\nHigh yield \n\nEmerging markets debt** \n\nCash \n\n* Includes Australian and global investment grade credit. \n** May include holdings of sub-investment grade instruments. \n\nRanges \n\n0 – 100% \n\n0 – 25% \n\n0 – 25% \n\n0 – 100% \n\nMedium to high (estimate of 3 to 4 negative annual returns in any 20 year period) \n\nPIMCO Diversified Fixed Interest Fund - Wholesale Class \n\nTo achieve maximum total return by investing in underlying funds that invest in Australian and \nglobal bonds, and to seek to preserve capital through prudent investment management. \n\n50% Bloomberg Barclays Global Aggregate Index (Hedged in Australian dollars) and 50% Bloomberg \nAusBond Composite 0+ Yr Index \n\n",
 				"---Example 1 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Macquarie Income Opportunities Fund\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"PIMCO Diversified Fixed Interest Fund - Wholesale Class\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (Hedged in Australian dollars) and 50% Bloomberg AusBond Composite 0+ Yr Index\"}]}",
 				"---Example 2 Start---",
 				"Australian shares continued \n\nAusbil Australian Emerging Leaders Fund \nInvestment objective \nTo provide returns above the Benchmark over the medium to long term (before fees and tax). \nBenchmark \n70% S&P/ASX Midcap 50 Accumulation Index \n30% S&P/ASX Small Ordinaries Accumulation Index \nHow the investment option is \nmanaged \nThe fund predominantly invests in a portfolio of mid and small cap Australian equities primarily \nchosen from the S&P/ASX 300 Index, but generally excludes securities from the S&P/ASX 50 Index. \nAt all times the fund will favour sectors and specific companies which it believes will experience \npositive earnings revisions. \n",
 				"---Example 2 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
 				"---Example 3 Start---",
 				"Composite benchmarks \n\nThe objective for some funds includes a reference to a composite benchmark. They may be subject to change \nat any time within the allocation ranges. \n\nFund name \nComposite benchmark \nCFS Defensive \nBuilder \n4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core \nWholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index \n(Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate \nIndex (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg \nAusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index. \nCFS Growth Builder \n29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% \nMSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, \n5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD \nhedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE \nDeveloped Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure \nIndex (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate \nCorporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index. \n",
 				"---Example 3 End---",
 				"For this example, there are multiple fund names with multiple benchmark names with weightings, please extract them all.",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"CFS Defensive Builder\", \"benchmark_name\": \"4.0% S&P/ASX 300 Accumulation Index, 4.0% MSCI All Country World ex Australia Net Index, 2.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 25.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 18.0% Bloomberg Global Aggregate Index (AUD hedged), 20.0% Bloomberg AusBond Composite 0+Yr Index, 20.0% Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"CFS Growth Builder\", \"benchmark_name\": \"29.0% S&P/ASX 300 Accumulation Index, 25.0% MSCI All Country World ex Australia Net Index, 14.0% MSCI All Country World ex Australia Net Index (AUD hedged), 3.0% MSCI World Small Cap Index, 5.0% MSCI Emerging Markets Index, 2.0% FTSE EPRA Nareit Developed ex Aus Rental Index (AUD hedged), 2.0% MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index, 2.0% FTSE Developed Core Infra 50/50 100% Hedged to AUD, 4.0% MSCI Australia Quarterly Private Infrastructure Index (Unfrozen) – Post-fee Total Return (50th Percentile), 12.0% Bloomberg Global Aggregate Corporate Index (hedged AUD), 2.0% Bloomberg AusBond Bank Bill Index\"}]}",
 				"---Example 4 Start---",
 				"\n\nInvestment option \n1 \nPerformance - fee rate \n2 \nPerformance hurdle \n4 \n(maximum fee \n3 \n) \nPayment \nfrequency \nAusbil Australian \nEmerging Leaders \n15.375% \n(maximum 15% ex-GST) \nComposite benchmark (70% S&P/ASX Midcap 50 \nAccumulation Index and 30% S&P/ASX Small Ordinaries \nAccumulation Index) plus 0.85% pa \nMonthly \n",
 				"---Example 4 End---",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index and 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}",
 				"---Example 5 Start---",
 				"Fixed interest / Income \nCash \nUBS Diversified Fixed Income BT Cash Management Trust \nBenchmark \n50% Bloomberg Barclays Global \nAggregate Index (A$ hedged), 50% \nBloomberg AusBond Composite \n0+ Yr Index* \nBloomberg AusBond Bank Bill Index* \n",
 				"---Example 5 End---",
 				"For this example, please read carefully for fund names in same line: \"UBS Diversified Fixed Income BT Cash Management Trust\", there are 2 fund names: \"UBS Diversified Fixed Income\" and \"BT Cash Management Trust\".",
 				"There are 2 benchmark names: \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\" and \"Bloomberg AusBond Bank Bill Index\".",
 				"The output should be:",
 				"{\"data\": [{\"fund name\": \"UBS Diversified Fixed Income\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (A$ hedged), 50% Bloomberg AusBond Composite 0+ Yr Index\"}, {\"fund name\": \"BT Cash Management Trust\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}]}",
 				"\n",
 				"C. Don't extract benchmark name from context when fit below cases.",
 				"1. Exclude benchmark name when its reported name is \"Return target\".",
 				"2. Exclude benchmark name which start with \"CPI minus\" or \"CPI plus\" or \"CPI +\" or \"CPI -\".",
 				"---Example 1 Start---",
 				"A closer look at our sector investment options \n\nCash¹\nDiversified Fixed Interest\nReturn target CPI minus 0.5% per annum on average over 20 years.",
 				"---Example 1 End---",
 				"Explanation:",
 				"The CPI minus 0.5% target is not suitable for Cash and Diversified Fixed Interest funds. ", 
 				"Because these funds have different objectives: Cash funds focus on capital preservation and liquidity, aligning with short-term interest rates, ", 
 				"while Diversified Fixed Interest funds aim to reflect bond market performance, influenced by interest rates and credit risk, not inflation.",
 				"The output should be:",
 				"{\"data\": []}",
 				"---Example 2 Start---",
 				"Infrastructure1 Australian Shares \n\nReturn target \n\nCPI plus 2.0% per annum on average over 20 years. CPI plus 4.0% per annum on average over 20 years. \n\n",
 				"---Example 2 End---",
 				"Explanation:",
 				"The terms \"CPI plus 2.0% per annum\" and \"CPI plus 4.0% per annum\" are return targets, not benchmarks. ", 
 				"A benchmark is typically a specific index or standard used to measure the performance of an investment, such as the S&P/ASX 200 for Australian shares. ", 
 				"Return targets, on the other hand, are goals set by the fund to achieve a certain level of performance over a specified period, in this case, 20 years. ", 
 				"They indicate the desired outcome rather than serving as a comparative measure against market performance.",
 				"The output should be:",
 				"{\"data\": []}",
 				"D. If extracted multiple benchmark names, but without weightings, e.g. 50% or 30%, please ignore and output empty.",
 				"---Example Start---",
 				"This is calculated by using the weighted average of the \nasset allocation neutral position and the index returns for each asset class. \n\nBT Multi-manager Growth Fund, BT Multi-manager Balanced Fund, BT Multi-manager Conservative Fund \nand BT Multi-manager High Growth Fund \n\nAsset class Indices \nAustralian shares S&P/ASX 300 Accumulation Index \nInternational shares MSCI World ex Australia $A (Net Dividends Reinvested) \nMSCI World ex Australia Hedged $A (Net Dividends Reinvested) \nMSCI Emerging Market (Net Dividends Reinvested) in AUD \nAustralian property S&P/ASX 300 A-REIT Accumulation Index \nInternational property FTSE EPRA/NAREIT Developed Hedged in AUD Net TRI \n",
 				"---Example End---",
 				"The output should be:",
 				"{\"data\": []}"
 			]
 		},
 		"sepcial_rule_by_keywords":
 		{
 			"management_fee_and_costs": [
 				{
 					"keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"],
 					"keywords_is_regex": false,
 					"sub_datapoints": ["administration_fees", "performance_fee_costs"],
 					"prompts": [
 					"### Complex management fee and costs rule", 
 					"If the table with columns:",
 					"\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"",
 					"The administration_fees is \"Administration fees\"",
 					"The management_fee is \"Investment fees\".",
 					"The management_fee_and_costs is \"Investment fees\" + \"Estimated other investment costs\".",
 					"The performance_fee_costs is \"Estimated performance fees\"",
 					"---Example 1 Start---",
 					"\nInvestment option \nAdministration fees \nEstimated administration costs \nInvestment fees \nEstimated performance fees \nEstimated other investment costs \nEstimated transaction costs \nEstimated total ongoing annual fees and costs \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nInternetional shares \nPerpetual Global \nInnovation Share \n0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n",
 					"---Example 1 End---", 
 					"The data points numbers order in data row (for example: 0.25% \n0.00% \n0.99% \n2.30 \n0.01% \n0.27% \n3.82% \n) is correct as initial table structure.",
 					"Please pay attention below information",
 					"Assume the column sequence number is from 1.", 
 					"\"Administration fees\" values are as the column 1 numbers, \"Investment fees\" values are as the column 3 numbers, \"Estimated other investment costs\" values are as the column 5 numbers, \"Estimated performance fees\" values are as the column 4 numbers.",
 					"For fund: Perpetual Global Innovation Share, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.99, the management_fee_and_costs should be 1 = 0.99(the column 3 number) + 0.01 (the column 5 number), the performance_fee_costs should be 2.3 (the column 4 number)",
 					"Therefore, the output should be:",
 					"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}]}, {\"fund name\": \"Perpetual Global Innovation Share\", \"share name\": \"Perpetual Global Innovation Share\", \"management_fee_and_costs\": 1, \"management_fee\": 0.99, \"administration_fees\": 0.25, \"performance_fee_costs\": 2.3}"
 					]
 				},
 				{
 					"keywords": ["Entry Fee option \nNil Entry option"],
 					"keywords_is_regex": false,
 					"sub_datapoints": ["performance_fee_costs"],
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"If the table with columns:",
 					"\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"",
 					"The performance_fee_costs is \"Estimated Performance fees\"",
 					"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
 					"The fund name's tail is \"Nil Entry\" for \"Nil Entry option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
 					"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
 					"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry option\" + \"Estimated other investment costs\".",
 					"---Example 1 Start---",
 					"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
 					"---Example 1 End---",
 					"Please pay attention below information",
 					"Assume the numeric column sequence is from 1.", 
 					"\"Entry Fee option\" values are as the 1st column values, \"Nil Entry option\" values are as the 2nd column values, \"Estimated other investment costs\" values are as the 3rd column values, \"Estimated Performance fees\" values are as the 4th column values.",
 					"Here is the example to get data, step by step.",
 					"For this fund in Example:",
 					"Platinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
 					"Step 1 Get new fund name",
 					"Combine \"Platinum Asia\" with \"Entry Fee\" as \"Platinum Asia Entry Fee\"",
 					"Combine \"Platinum Asia\" with \"Nil Entry\" as \"Platinum Asia Nil Entry\"",
 					"Step 2 **EXCLUE the values of the last three columns of data.**",
 					"ONLY KEEP these 4 values: 2.14 2.99 0.02 0.00 for next steps",
 					"Step 3 Calculate management_fee and management_fee_and_costs for these 2 new funds:",
 					"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (Value of 1st column) + 0.02 (Value of 3rd column)",
 					"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (Value of 2nd column) + 0.02 (Value of 3rd column)",
 					"**Make sure don't take \"Estimated other investment costs\" value from the wrong column!!!**",
 					"Step 4 Get performance_fee_costs",
 					"the fund: Platinum Asia Entry Fee, performance_fee_costs is 0 (Value of 4th column)",
 					"the fund: Platinum Asia Nil Entry, performance_fee_costs is 0 (Value of 4th column)",
 					"Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0",
 					"**Make sure don't take \"Estimated Performance fees\" value from the wrong column!!!**",
 					"Please ignore the last fund name of previous PDF page, and extract data as these 4 steps for all of records in Context.",
 					"Therefore, the output should be:",
 					"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}"
 					]
 				},
 				{
 					"keywords": ["Retirement and TTR income streams"],
 					"keywords_is_regex": false,
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"For management_fee_and_costs, ", 
 					"a. If the title is \"Retirement and TTR income streams\"", 
 					"it means each investment name is with two fund names, one is for Retirement as pension, another is for TTR.",
 					"For example, if the investment name is \"Defensive Growth\", the Retirement fund name is \"Defensive Growth Pension\", the TTR fund name is \"Defensive Growth TTR\".",
 					"b. If the title is \"Retirement income stream only\"",
 					"it means each investment name is with only one fund name, it is for Retirement as pension.",
 					"For example, if the investment name is \"Lifestyle Growth\", the Retirement fund name is \"Lifestyle Growth Pension\".",
 					"c. If the title is \"TTR income stream only\"",
 					"it means each investment name is with only one fund name, it is for TTR.",
 					"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
 					"---Example 1 Start---",
 					"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares, 0.07% p.a. for Cash \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
 					"---Example 1 End---",
 					"Please read the context carefully, especially for \"Retirement and TTR income streams\" part, output all of fund names and relevant values",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, , {\"fund name\": \"Cash Pension\", \"share name\": \"Cash Pension\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Cash TTR\", \"share name\": \"Cash TTR\", \"management_fee_and_costs\": 0.07, \"management_fee\": 0.07}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
 					]
 				},
 				{
 					"keywords": ["Recoverable expenses \nEstimated other indirect costs"],
 					"keywords_is_regex": false,
 					"sub_datapoints": ["performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread"],
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"If the table with columns:",
 					"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
 					"The management_fee is \"Management fee (% pa)\".",
 					"The management_fee_costs is \"Management fee (% pa)\" + \"Recoverable expenses\" + \"Estimated other indirect costs\".",
 					"The recoverable_expenses is \"Recoverable expenses\"",
 					"The indirect_costs is \"Estimated other indirect costs\"",
 					"The performance_fee_costs is \"Peformance fees charged to the Investment Option by underlying managers\".",
 					"The interposed_vehicle_performance_fee_cost is \"Performance fees charged by interposed vehicles\"",
 					"The buy_spread and sell_spread are \"Buy/sell spreads\".",
 					"---Example 1 Start---",
 					"Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \nNorth Active Defensive \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \nNorth Active Moderately \nDefensive \n0.72 \n0.07 \n0.04 \n0.00 \n0.01 \n0.14 \n0.09/0.09 \nNorth Index Growth \n0.45 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.06/0.06 \nNorth Index High Growth \n0.45 \n0.00 \n0.01 \n0.00 \n0.00 \n0.01 \n0.06/0.07 \n",
 					"---Example 1 End---",
 					"For this case: ", 
 					"a. The fund name is before the data row, e.g. North Active Defensive",
 					"c. The data points numbers in data row. ", 
 					"For example: \n0.62 \n0.18 \n0.05 \n0.00 \n0.00 \n0.14 \n0.08/0.08 \n is with correct order as initial table structure.",
 					"The 1st number: 0.62 is the management_fee,", 
 					"the 2nd number: 0.18 is the recoverable_expenses,",
 					"the 3rd number: 0.05 is the indirect_costs",
 					"the 4th number: 0.00 is the performance_fee_costs,", 
 					"the 5th number: 0.00 is the interposed_vehicle_performance_fee_cost, ", 
 					"the 6th number: 0.14 is the Transaction costs (% pa).", 
 					"the 7th number: 0.08 is the buy_spread, ",
 					"the 8th number: 0.08 is the sell_spread.",
 					"The management_fee_and_costs is Management fee (i) + Recoverable expenses + Estimated other indirect costs = 0.62 + 0.18 + 0.05= 0.85",
 					"**Attention: Ignore Transaction costs (% pa), the 6th number, DO NOT APPLY ITS VALUE TO CALCULATE management_fee_and_costs!!!**",
 					"The output should be: ",
 					"{\"data\": [{\"fund name\": \"North Active Defensive\", \"share name\": \"North Active Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.62, \"recoverable_expenses\": 0.18, \"indirect_costs\": 0.05, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"North Active Moderately Defensive\", \"share name\": \"Active Moderately Defensive\", \"management_fee_and_costs\": 0.83, \"management_fee\": 0.72, \"recoverable_expenses\": 0.07, \"indirect_costs\": 0.04,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0.01, \"buy_spread\": 0.09, \"sell_spread\": 0.09}, {\"fund name\": \"North Index Growth\", \"share name\": \"North Index Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.06}, {\"fund name\": \"North Index High Growth\", \"share name\": \"North Index High Growth\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.45, \"recoverable_expenses\": 0, \"indirect_costs\": 0.01,\"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.06, \"sell_spread\": 0.07}]}",
 					"---Example 2 Start---",
 					"Investment Option \nManagement fee (% pa) \nRecoverable expenses \nEstimated other indirect costs \nPerformance fees charged to the Fund by underlying managers \nPerformance fees charged by interposed vehicles \nTransaction costs (% pa) \nBuy/sell spreads (%) \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
 					"---Example 2 End---",
 					"For this case: ", 
 					"a. This table header is same as Example 1.",
 					"b. The algorithm to calculate management_fee_and_costs is same as Example 1.",
 					"c. The difference is **the fund name is after the data row, e.g. the fund name of the first data row is: MyNorth Australian Fixed Interest Index**",
 					"The output should be: ",
 					"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"recoverable_expenses\": 0, \"indirect_costs\": 0, \"performance_fee_costs\": 0, \"interposed_vehicle_performance_fee_cost\": 0, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
 					]
 				},
 				{
 					"keywords":["Plus other investment fees and costs \nEquals investment fees and costs"],
 					"keywords_is_regex": false,
 					"sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"],
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"If the table with columns:",
 					"\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".",
 					"Both of the management_fee and management_fee_costs  are \"Plus other investment fees and costs\".",
 					"The performance_fee_costs is \"Performance fee\".",
 					"The buy_spread and sell_spread are \"Buy-sell spreads\".",
 					"---Example Start---",
 					"Performance fee \nPlus other investment fees and costs \nEquals investment fees and costs \nTransaction costs(net) \nBuy-sell spreads \nTransaction costs(gross) \nMLC multi-asset portfolios\nMLC Inflation Plus\nConservative Portfolio\nSuper & Pension \npre-retirement phase \n0.18 \n0.77 \n0.95 \n0.01 \n0.10 / 0.10 \n0.09 \nRetirement Phase \n0.18 \n0.77 \n0.95 \n0.04 \n0.10 / 0.10 \n0.09 \n",
 					"---Example End---",
 					"Identify the value of the 1st column \"Performance fee\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0",
 					"Please ignore the 3rd column: \"Equals investment fees and costs\" values!!",
 					"Please read context carefully, don't miss any data row!!",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
 					]
 				},
 				{
 					"keywords":["Total\\s*administration\\s*and (management|investment)\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"],
 					"keywords_is_regex": true,
 					"sub_datapoints": ["administration_fees", "performance_fee_costs", "buy_spread", "sell_spread"],
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"---Example Start---",
 					"Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n",
 					"---Example End---",
 					"For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ", 
 					"\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ", 
 					"\"Performance fee (p.a.)\" as performance_fee_costs, ", 
 					"\"Buy/sell spread (%)\" as buy_spread and sell_spread.",
 					"If one row has 5 decimal numbers, ", 
 					"the 2nd decimal number is the administration_fees, ",
 					"the 3rd decimal number is the management_fee_and_costs and management_fee, ", 
 					"the 4th decimal number is the performance_fee_costs, ", 
 					"the 5th decimal number is the buy_spread and sell_spread.",
 					"If one row has 4 decimal numbers, ", 
 					"the 2nd decimal number is the administration_fees, ",
 					"the 3rd decimal number is the management_fee_and_costs and management_fee, ", 
 					"the 4th decimal number is the buy_spread and sell_spread.",
 					"\"Buy/sell spread\" is always as the last decimal value column, for buy_spread and sell_spread, please extract all of them.", 
 					"Please always ignore the 1st decimal number, we need not the total sum values.",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
 					]
 				},
 				{
 					"keywords":["Total\\s*of\\s*(management|investment)\\s*fees\\s*and\\s*costs\\s*and\\s*performance\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"],
 					"keywords_is_regex": true,
 					"sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"],
 					"prompts": [
 					"### Complex management fee and costs rule",
 					"---Example Start---",
 					"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
 					"---Example End---",
 					"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
 					"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
 					"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
 					"If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.",
 					"If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.",
 					"So the output should be:",
 					"{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
 					]
 				}
 			]
 		}
 	},
@ -176,7 +924,7 @@
 			{
 				"title": "Don't fetch data with number range statement",
 				"contents":[
-					"If the value is with number range statement, e.g. \"up to\" or \"from to\" or \"between and\", please ignore the value.",
+					"If the value is with number range statement, e.g. \"up to\" or \"from to\" or \"between and\" or \"to\", please ignore the value.",
 					"Example 1:",
 					"-----Example Start-----",
 					"A-Class\nB-Class\nC-Class\n",
@ -202,25 +950,28 @@
 	},
 	"output_requirement": {
 		"common": [
 			"If possible, please extract fund name, share name, data points values as the output.",
 			"If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".",
 			"Only output the data point which with relevant value.",
 			"Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
 			"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
 			"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
 			"Please also output the data point reported name in context.",
 			"Example:",
 			"---Example Start---",
 			"\n Investment option \nInvestment option \nmanagement \ncosts1  \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2  \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net)  \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross)  \n% p.a. \n(A + B)\nTotal Management \nfees and costs  \n(net)  \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n",
 			"---Example End---",
 			"Output:",
 			"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]",
 			"Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.",
 			"The output should be JSON format, the format is like below example(s):"
 		],
-		"fund_level": [
+		"fund_level": {
-			"[{\"fund name\": \"fund 1 - sub fund name 1\",\"benchmark_name\": \"S&P 500 Index Fund\"}, {\"fund name\": \"fund 2 - sub fund name 2\",\"benchmark_name\": \"FTSE All Share\"}]"
+			"fund_name":[
-		],
+				"fund 1",
 				"fund 2",
 				"fund 3"
 			],
 			"benchmark_name_value":[
 				"S&P/ASX 300 Accumulation Index plus 2% pa",
 				"FTSE EPRA/NAREIT Developed Index",
 				"Bloomberg AusBond Bank Bill Index"
 			],
 			"minimum_initial_investment_value": [1000, 5000, 10000]
 		},
 		"share_level": {
 			"fund_name": [
 				"fund 1",
@ -232,11 +983,10 @@
 				"share 2",
 				"share 3"
 			],
-			"total_annual_dollar_based_charges_value": [125.00, 95.00, 26.00],
+			"total_annual_dollar_based_charges_value": [65, 57, 67.6],
 			"management_fee_and_costs_value": [2.63, 1.58, 2.55],
 			"management_fee_value": [0.85, 1.10, 0.23],
-			"performance_fee_value": [0.03, 0.21, 0.08],
+			"performance_fee_costs_value": [0.03, 0.21, 0.08],
 			"performance_fee_costs_value": [0.05, 0.25, 0.09],
 			"buy_spread_value": [0.10, 0.15, 0.12],
 			"sell_spread_value": [0.10, 0.10, 0.15],
 			"establishment_fee_value": [0.75, 1.20, 0.25],
@ -253,41 +1003,42 @@
 			"date_of_last_hwm_reset_value": ["29 March 2023", "18 April 2024", "19 October 2021"],
 			"date_of_last_performance_fee_restructure_value": ["12 August 2022", "15 March 2024", "11 November 2023"],
 			"high_water_mark_type_value": ["Total Return", "Excess Return", "Both TR & ER"],
-			"minimum_initial_investment_value": [0, 5000, 10000],
+			"indirect_costs_value": [0.12, 0.16, 0.02],
-			"recoverable_expenses_value": [0.12, 0.05, 0.06],
+			"recoverable_expenses_value": [0.01, 0.05, 0.06],
-			"indirect_costs_value": [0.12, 0.16, 0.02]
+			"change_recoverable_expenses_value": [0.01, 0.02, 0.03]
 		},
 		"dp_reported_name" : {
 			"total_annual_dollar_based_charges": "Total annual dollar based charges",
 			"management_fee_and_costs": "Management fee and costs",
 			"management_fee": "Management fee",
-			"performance_fee": "Performance fee",
+			"performance_fee_costs": "Performance fee",
 			"performance_fee_costs": "Performance fee costs",
 			"buy_spread": "Buy spread",
 			"sell_spread": "Sell spread",
 			"administration_fees": "Administration fee",
 			"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost", 
 			"benchmark_name": "Benchmark name",
 			"minimum_initial_investment": "Minimum initial investment",
 			"indirect_costs": "Indirect cost",
 			"recoverable_expenses": "Recoverable expenses",
 			"change_recoverable_expenses": "Change recoverable expenses",
 			"establishment_fee": "Establishment fee",
 			"contribution_fee": "Contribution fee",
 			"withdrawal_fee": "Withdrawal fee",
 			"switching_fee": "Switching fee",
 			"activity_fee": "Activity fee",
 			"exit_fee": "Exit fee",
 			"administration_fees": "Administration fee",
 			"interposed_vehicle_performance_fee_cost": "Interposed vehicle performance fee cost", 
 			"additional_hurdle": "Additional hurdle",
 			"benchmark_name": "Benchmark name",
 			"reference_rate": "Reference rate",
 			"crystallisation_frequency": "Crystallisation frequency",
 			"date_of_last_hwm_reset": "Date of last hwm reset",
 			"date_of_last_performance_fee_restructure": "Date of last performance fee restructure",
-			"high_water_mark_type": "High-water mark type",
+			"high_water_mark_type": "High-water mark type"
 			"minimum_initial_investment": "Minimum initial investment",
 			"recoverable_expenses": "Recoverable expenses",
 			"indirect_costs": "Indirect cost"
 		}
 	},
 	"end": [
 		"Only output JSON data.",
-		"Don't output the value which not exist in context.",
+		"Please re-check before output answer, DO NOT output the data point and value which not exist in context.",
 		"DO NOT use the example values from a representative fund (such as Balanced Growth) for other funds unless explicitly mentioned",
 		"If can't find fund name or share class name in context, please output empty JSON data: {\"data\": []}"
 	]
 }
--- a/instructions/aus_prospectus/document_category_prompts.json
+++ b/instructions/aus_prospectus/document_category_prompts.json
@ -0,0 +1,32 @@
 {
 	"prompts": [
        "1. Identify document category: Super or MIS\n",
        "In a prospectus for an MIS(Managed Investment Scheme) product you’ll typically see references to a “responsible entity”, a registration number (ARSN) and disclosures that comply with the Corporations Act’s regime for managed investment schemes (e.g. pooling of funds, unit trusts, detailed product disclosures, and rules on redemption).\n", 
        "In contrast, a prospectus or product disclosure statement for a Super(superannuation) product will refer to superannuation or MySuper, include terms related to compulsory employer contributions, tax concessions, and comply with superannuation-specific legislation and guidelines (for example, those issued by APRA or the ATO).\n",
        "In short, look at the headings, statutory references, product descriptions, and regulatory disclaimers: if they discuss “managed investment schemes” or “responsible entities” and related disclosure obligations under the Corporations Act, it’s an MIS document; if they mention superannuation, MySuper, employer contributions, and similar features, then it belongs to the Super regime.\n",
        "• To determine the regime of a document, simply check the beginning pages where the fund is mentioned.",
        "• If keywords like Pension, Transition to Retirement (TTR), Super, Income Stream, or Accumulation are found, the document belongs to the Super regime; In this case, the business rules associated with the Super regime should be applied.", 
        "If these keywords are not present, the document falls under the MIS regime.",
        "Please identify whether the document belongs to the Super or MIS regime according to the context, and output answer as JSON format.",
        "The example is: {\"document_category\": \"Super\"}\n",
        "\n",
        "2. Get production name from document context. \n",
        "The production name is the name of the relevant fund(s) product that the document is about. It is usually found in the title of the document or in the first few pages of the document.\n",
        "Please provide the production name as a string.\n",
        "----Example 1 context start----",
        "MLC MasterKey Super & \nPension Fundamentals \n\nYour Guide to what is included in the MLC MasterKey Super \n& Pension Fundamentals Product Disclosure Statement \n\nPreparation date \n\n30 September 2022 \n\nIssued by the Trustee \n\nNULIS Nominees \n(Australia) Limited \nABN 80 008 515 633 \nAFSL 236465 \n\nThe Fund \n\nMLC Super Fund \nABN 70 732 426 024 \n\nMLC MasterKey Super & \nPension Fundamentals \n\nProduct Disclosure Statement \n\nThe Insurer \n\nInsurance is issued by \nMLC Limited \nABN 90 000 000 \n402 AFSL 230694 \n\n1. About MLC MasterKey Super & Pension Fundamentals \n\nYou can use this Product Disclosure Statement (PDS) to find what you need to know \nabout your super and how we can help you reach your retirement goals",
        "----Example 1 context end----",
        "The output should be:", 
        "{\"document_production\": \"MLC MasterKey Super & Pension Fundamentals\"}\n",
        "\n",
        "----Example 2 context start----",
        "Pension \nProduct Disclosure \nStatement \n\nThis legalsuper Pension Product Disclosure Statement is \n\nissued by Legal Super Pty Ltd, Level 9, 627 Chapel Street, \nSouth Yarra, 3141 (ABN 37 004 455 789, AFSL 246315) \nas the Trustee for legalsuper ABN 60 346 078 879. \n\nIssued 1 April 2023\nLEGALSUPER PENSION \nPRODUCT DISCLOSURE STATEMENT \n\nIssued by Legal Super Pty Ltd \nLevel 9, 627 Chapel Street, South Yarra, 3141 \nABN 37 004 455 789, AFSL 246315, L0002585 \nlegalsuper ABN 60 346 078 879 (the Fund) \n\nFund Contact Details \n\nPhone: 1800 060 312 (8am to 8pm [AEST/AEDT] Monday to Friday)\nEmail: mail@legalsuper.com.au \nlegalsuper.com.au \n\nDate of preparation: 1 April 2023",
        "----Example 2 context end----",
        "The output should be:",
        "{\"document_production\": \"legalsuper Pension\"}\n",
        "\n",
        "3. Output format, please provide the answer with both of document_category and document_production format, here is the example:\n",
        "{\"document_category\": \"Super\", \"document_production\": \"ABC Superannuation Fund\"}\n",
        "Answer:\n"
    ]
 }
--- a/instructions/aus_prospectus/objective_fund_name_prompts.json
+++ b/instructions/aus_prospectus/objective_fund_name_prompts.json
@ -0,0 +1,15 @@
 {
 	"prompts": [
        "Get the fund name from document context. \n",
        "The document context contains fund investment objective(s).\n",
        "1. Please locate the last investment objective in the document context.\n",
        "2. Please provide the relevant fund name for the last investment objective.\n",
        "3. Usually, the fund name can be found in the several upon lines of the last investment objective.\n",
        "----Example context start----",
        "\n\nMLC Horizon 4 Balanced Portfolio \n\nThis option invests in a wide range of asset classes with a strong bias towards shares and other growth assets. It ’ s designed for members who \nare focused on higher returns and are willing to take on exposure to more volatile investments. \n\nMLC Horizon 4 Balanced Portfolio \nInvestment objective \nAims to grow by more than inflation +3% pa (after fees and tax) over 10 years. \nBenchmark \nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \nThe investment option may be \nsuited to you if... \nyou want your investment to exceed changes in the costs of living, over the long term \nyou want a higher emphasis on growth than stability \nyou understand returns may be higher or lower than its objective, and \nyou value active management. \n\n3 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement",
        "----Example context end----",
        "The output should be as JSON format:", 
        "{\"fund_name\": \"MLC Horizon 4 Balanced Portfolio\"}\n",
        "Answer:\n"
    ]
 }
--- a/instructions/emea_ar/compare_table_structure_prompts.json
+++ b/instructions/emea_ar/compare_table_structure_prompts.json
@ -0,0 +1,9 @@
 {
 	"prompts": [
        "Assume there is a data table in current page contents, is there the table with same table structure in the next page contents?", 
        "The meaning of \"same\" is: with totally same table columns for the table in both of current page and next page.",
        "Please output JSON format, the format example is:",
        "{\"answer\": \"Yes\"} or {\"answer\": \"No\"}",
        "Answer:\n"
    ]
 }
--- a/instructions/emea_ar/data_extraction_prompts_config.json
+++ b/instructions/emea_ar/data_extraction_prompts_config.json
@ -97,7 +97,8 @@
 				"The performance fees should not be the presence of the rates at which the performance fees are calculated.",
 				"The reported of performance fees should not be \"% based on the NAV at the end of the accounting period\""
 			]
-		}
+		},
 		"sepcial_rule_by_keywords": {}
 	},
 	"special_cases": {
 		"common": [
@ -342,9 +343,18 @@
 			"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
 			"The output should be JSON format, the format is like below example(s):"
 		],
-		"fund_level": [
+		"fund_level": {
-			"[{\"fund name\": \"fund 1 - sub fund name 1\",\"tor\": 35.26}, {\"fund name\": \"fund 2 - sub fund name 2\",\"tor\": -28.26}, {\"fund name\": \"fund 3\",\"tor\": 115.52,}]"
+			"fund_name": [
-		],
+				"fund 1 - sub fund name 1",
 				"fund 2 - sub fund name 2",
 				"fund3"
 			],
 			"tor_value": [
 				35.26,
 				-28.26,
 				115.52
 			]
 		},
 		"share_level": {
 			"fund_name": [
 				"fund 1",
--- a/main.py
+++ b/main.py
@ -1,5 +1,6 @@
 import os
 import json
 import numpy as np
 import pandas as pd
 from glob import glob
 from tqdm import tqdm
@ -18,7 +19,7 @@ from core.data_extraction import DataExtraction
 from core.data_mapping import DataMapping
 from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call
 from core.metrics import Metrics
-
+import certifi
 class EMEA_AR_Parsing:
    def __init__(
@ -150,6 +151,7 @@ class EMEA_AR_Parsing:
                data_from_gpt = data_extraction.extract_data()
            except Exception as e:
                logger.error(f"Error: {e}")
                print_exc()
                data_from_gpt = {"data": []}
        # Drilldown data to relevant PDF document
@ -277,7 +279,39 @@ class EMEA_AR_Parsing:
                )
                with open(json_file, "r", encoding="utf-8") as f:
                    doc_mapping_data = json.load(f)
-                return doc_mapping_data
+                if self.doc_source == "aus_prospectus":
                    output_data_folder_splits = output_data_json_folder.split("output")
                    if len(output_data_folder_splits) == 2:
                        merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
                        os.makedirs(merged_data_folder, exist_ok=True)
                        merged_data_json_folder = os.path.join(merged_data_folder, "json/")
                        os.makedirs(merged_data_json_folder, exist_ok=True)
                        merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
                        os.makedirs(merged_data_excel_folder, exist_ok=True)
                        merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
                        if os.path.exists(merged_data_file):
                            with open(merged_data_file, "r", encoding="utf-8") as f:
                                merged_data_list = json.load(f)
                            return merged_data_list
                        else:
                            data_mapping = DataMapping(
                                                self.doc_id,
                                                self.datapoints,
                                                data_from_gpt,
                                                self.document_mapping_info_df,
                                                self.output_mapping_data_folder,
                                                self.doc_source,
                                                compare_with_provider=self.compare_with_provider
                                            )
                            merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data,
                                                                                             merged_data_json_folder,
                                                                                             merged_data_excel_folder)
                        return merged_data_list
                else:
                    return doc_mapping_data
        """
        doc_id,
        datapoints: list,
@ -419,7 +453,6 @@ def batch_start_job(
    pdf_folder: str = "/data/emea_ar/pdf/",
    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
    doc_data_excel_file: str = None,
    document_mapping_file: str = None,
    output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
    output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
    output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
@ -497,7 +530,17 @@ def batch_start_job(
            )
        logger.info(f"Saving mapping data to {output_mapping_total_folder}")
-        unique_doc_ids = result_mappingdata_df["doc_id"].unique().tolist()
+        result_mappingdata_df_columns = list(result_mappingdata_df.columns)
        doc_id_column = ""
        if "doc_id" in result_mappingdata_df_columns:
            doc_id_column = "doc_id"
        if "DocumentId" in result_mappingdata_df_columns:
            doc_id_column = "DocumentId"
        if doc_id_column == "":
            logger.error(f"Cannot find doc_id column in mapping data")
            return
        unique_doc_ids = result_mappingdata_df[doc_id_column].unique().tolist()
        os.makedirs(output_mapping_total_folder, exist_ok=True)
        time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
        file_name = f"mapping_data_info_{len(unique_doc_ids)}_documents_by_{extract_way}_{time_stamp}.xlsx"
@ -505,11 +548,11 @@ def batch_start_job(
            file_name = f"{total_data_prefix}_{file_name}"
        output_file = os.path.join(output_mapping_total_folder, file_name)
-        doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
+        # doc_mapping_data_in_db = only_output_mapping_data_in_db(result_mappingdata_df)
        with pd.ExcelWriter(output_file) as writer:
-            doc_mapping_data_in_db.to_excel(
+            # doc_mapping_data_in_db.to_excel(
-                writer, index=False, sheet_name="data_in_doc_mapping"
+            #     writer, index=False, sheet_name="data_in_doc_mapping"
-            )
+            # )
            result_mappingdata_df.to_excel(
                writer, index=False, sheet_name="total_mapping_data"
            )
@ -517,27 +560,6 @@ def batch_start_job(
                writer, index=False, sheet_name="extract_data"
            )
        if (
            doc_source == "aus_prospectus"
            and document_mapping_file is not None
            and len(document_mapping_file) > 0
            and os.path.exists(document_mapping_file)
        ):
            try:
                merged_total_data_folder = os.path.join(
                    output_mapping_total_folder, "merged/"
                )
                os.makedirs(merged_total_data_folder, exist_ok=True)
                data_file_base_name = os.path.basename(output_file)
                output_merged_data_file_path = os.path.join(
                    merged_total_data_folder, "merged_" + data_file_base_name
                )
                merge_output_data_aus_prospectus(
                    output_file, document_mapping_file, output_merged_data_file_path
                )
            except Exception as e:
                logger.error(f"Error: {e}")
        if calculate_metrics:
            prediction_sheet_name = "data_in_doc_mapping"
            ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
@ -1028,22 +1050,21 @@ def batch_run_documents(
    doc_source: str = "emea_ar",
    special_doc_id_list: list = None,
    pdf_folder: str = r"/data/emea_ar/pdf/",
    document_mapping_file: str = None,
    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
    output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
    output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
    output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
    output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
    drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
    re_run_extract_data: bool = True,
    re_run_mapping_data: bool = True,
    force_save_total_data: bool = False
 ):
    sample_document_list_folder = r"./sample_documents/"
    document_list_files = glob(sample_document_list_folder + "*.txt")
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
    re_run_extract_data = True
    re_run_mapping_data = True
    force_save_total_data = False
    calculate_metrics = False
    extract_way = "text"
@ -1067,7 +1088,6 @@ def batch_run_documents(
                pdf_folder,
                output_pdf_text_folder,
                page_filter_ground_truth_file,
                document_mapping_file,
                output_extract_data_child_folder,
                output_mapping_child_folder,
                output_extract_data_total_folder,
@ -1087,7 +1107,6 @@ def batch_run_documents(
            pdf_folder,
            output_pdf_text_folder,
            page_filter_ground_truth_file,
            document_mapping_file,
            output_extract_data_child_folder,
            output_mapping_child_folder,
            output_extract_data_total_folder,
@ -1220,9 +1239,11 @@ def merge_output_data_aus_prospectus(
 ):
    # TODO: merge output data for aus prospectus, plan to realize it on 2025-01-16
    data_df = pd.read_excel(data_file_path, sheet_name="total_mapping_data")
    data_df.fillna("", inplace=True)
    document_mapping_df = pd.read_excel(
        document_mapping_file, sheet_name="document_mapping"
    )
    document_mapping_df.fillna("", inplace=True)
    # set doc_id to be string type
    data_df["doc_id"] = data_df["doc_id"].astype(str)
    document_mapping_df["DocumentId"] = document_mapping_df["DocumentId"].astype(str)
@ -1271,10 +1292,11 @@ def merge_output_data_aus_prospectus(
            for exist_raw_name_info in exist_raw_name_list:
                exist_raw_name = exist_raw_name_info["raw_name"]
                exist_investment_type = exist_raw_name_info["investment_type"]
                exist_investment_id = exist_raw_name_info["investment_id"]
                if (
                    exist_raw_name == raw_name
                    and exist_investment_type == investment_type
-                ):
+                ) or (len(exist_investment_id) > 0 and exist_investment_id == share_class_id):
                    exist = True
                    break
            if not exist:
@ -1294,7 +1316,7 @@ def merge_output_data_aus_prospectus(
                for datapoint_name in datapoint_name_list:
                    data[datapoint_name] = ""
                exist_raw_name_list.append(
-                    {"raw_name": raw_name, "investment_type": investment_type}
+                    {"raw_name": raw_name, "investment_type": investment_type, "investment_id": share_class_id}
                )
                doc_data_list.append(data)
            # find data from total_data_list by raw_name
@ -1305,10 +1327,18 @@ def merge_output_data_aus_prospectus(
                    if page_index not in data["page_index"]:
                        data["page_index"].append(page_index)
                    break
                if len(share_class_id) > 0 and data["sec_id"] == share_class_id:
                    update_key = datapoint
                    if len(str(data[update_key])) == 0:
                        data[update_key] = value
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                    break
        fund_doc_data_df = data_df[
            (data_df["doc_id"] == doc_id) & (data_df["investment_type"] == 33)
        ]
        fund_doc_data_df.fillna("", inplace=True)
        for index, row in fund_doc_data_df.iterrows():
            doc_id = str(row["doc_id"])
            page_index = int(row["page_index"])
@ -1319,7 +1349,6 @@ def merge_output_data_aus_prospectus(
            value = row["value"]
            fund_id = row["investment_id"]
            fund_legal_name = row["investment_name"]
            exist = False
            if fund_id != "":
                for data in doc_data_list:
@ -1331,7 +1360,14 @@ def merge_output_data_aus_prospectus(
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                        exist = True
-
+            else:
                for data in doc_data_list:
                    if data["raw_name"] == raw_name:
                        update_key = datapoint
                        data[update_key] = value
                        if page_index not in data["page_index"]:
                            data["page_index"].append(page_index)
                        exist = True
            if not exist:
                data = {
                    "DocumentId": doc_id,
@ -1357,23 +1393,125 @@ def merge_output_data_aus_prospectus(
        total_data_df.to_excel(writer, index=False, sheet_name="total_data")
 def get_aus_prospectus_document_category():
    """Record the document category and production of each sample document.

    Reads one document id per line from the sample list file, builds an
    ``EMEA_AR_Parsing`` and a ``DataExtraction`` object for every id, and
    collects the ``document_category`` / ``document_production`` attributes
    of the ``DataExtraction`` object. The collected mapping is written as a
    JSON file under ``/data/aus_prospectus/output/document_category/`` and
    logged.
    """
    document_sample_file = (
        r"./sample_documents/aus_prospectus_17_documents_sample.txt"
    )
    with open(document_sample_file, "r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) so that no parser is
        # built for an empty document id.
        special_doc_id_list = [
            doc_id.strip() for doc_id in f if len(doc_id.strip()) > 0
        ]
    pdf_folder: str = r"/data/aus_prospectus/pdf/"
    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = (
        r"/data/aus_prospectus/output/extract_data/docs/"
    )
    output_mapping_child_folder: str = (
        r"/data/aus_prospectus/output/mapping_data/docs/"
    )
    drilldown_folder = r"/data/aus_prospectus/output/drilldown/"
    doc_source = "aus_prospectus"
    extract_way = "text"
    # doc_id -> {"category": ..., "production": ...}
    document_category_dict = {}
    for doc_id in special_doc_id_list:
        emea_ar_parsing = EMEA_AR_Parsing(
            doc_id,
            doc_source=doc_source,
            pdf_folder=pdf_folder,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_folder=output_extract_data_child_folder,
            output_mapping_data_folder=output_mapping_child_folder,
            extract_way=extract_way,
            drilldown_folder=drilldown_folder,
            compare_with_provider=False
        )
        # Only the attributes set while constructing DataExtraction are read;
        # extract_data() itself is never called here.
        data_extraction = DataExtraction(
            doc_source=emea_ar_parsing.doc_source,
            doc_id=emea_ar_parsing.doc_id,
            pdf_file=emea_ar_parsing.pdf_file,
            output_data_folder=emea_ar_parsing.output_extract_data_folder,
            page_text_dict=emea_ar_parsing.page_text_dict,
            datapoint_page_info=emea_ar_parsing.datapoint_page_info,
            datapoints=emea_ar_parsing.datapoints,
            document_mapping_info_df=emea_ar_parsing.document_mapping_info_df,
            extract_way=extract_way
        )
        logger.info(f"Document: {doc_id}, \ncategory: {data_extraction.document_category}, \nproduction: {data_extraction.document_production}")
        document_category_dict[doc_id] = {"category": data_extraction.document_category, "production": data_extraction.document_production}
    output_extract_document_category_folder: str = (
        r"/data/aus_prospectus/output/document_category/"
    )
    os.makedirs(output_extract_document_category_folder, exist_ok=True)
    # Output file name is derived from the sample list name, e.g.
    # "aus_prospectus_17_documents_sample.txt" -> "17_documents_sample".
    document_sample_file_base_name = os.path.basename(document_sample_file).replace(".txt", "").replace("aus_prospectus_", "")
    output_file = os.path.join(output_extract_document_category_folder, f"{document_sample_file_base_name}_category_production.json")
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(document_category_dict, f, ensure_ascii=False, indent=4)
    logger.info(f"Document category and production: {document_category_dict}")
 def test_post_adjust_extract_data():
    """Manually run ``DataExtraction.post_supplement_data`` on one document.

    Builds the parser and extractor for a fixed document id, loads that
    document's previously saved extraction JSON and feeds it to
    ``post_supplement_data``. The result is not checked or returned.
    """
    doc_id = "480854121"
    doc_source = "aus_prospectus"
    extract_way = "text"

    parser = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=r"/data/aus_prospectus/pdf/",
        output_pdf_text_folder=r"/data/aus_prospectus/output/pdf_text/",
        output_extract_data_folder=r"/data/aus_prospectus/output/extract_data/docs/",
        output_mapping_data_folder=r"/data/aus_prospectus/output/mapping_data/docs/",
        extract_way=extract_way,
        drilldown_folder=r"/data/aus_prospectus/output/drilldown/",
        compare_with_provider=False,
    )
    extractor = DataExtraction(
        doc_source=parser.doc_source,
        doc_id=parser.doc_id,
        pdf_file=parser.pdf_file,
        output_data_folder=parser.output_extract_data_folder,
        page_text_dict=parser.page_text_dict,
        datapoint_page_info=parser.datapoint_page_info,
        datapoints=parser.datapoints,
        document_mapping_info_df=parser.document_mapping_info_df,
        extract_way=extract_way,
    )

    saved_json_path = os.path.join(
        r"/data/aus_prospectus/output/extract_data/docs/by_text/json/",
        f"{doc_id}.json",
    )
    with open(saved_json_path, "r", encoding="utf-8") as json_handle:
        saved_data = json.load(json_handle)

    # Other post-processing steps that can be swapped in here when needed:
    # remove_duplicate_data, post_adjust_for_value_with_production_name.
    extractor.post_supplement_data(saved_data)
 if __name__ == "__main__":
    # test_post_adjust_extract_data()
    # get_aus_prospectus_document_category()
    # test_data_extraction_metrics()
-    # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_11_documents_by_text_20250116220811.xlsx"
+    # data_file_path = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_1_documents_by_text_20250226155259.xlsx"
-    # document_mapping_file_path = r"/data/aus_prospectus/basic_information/11_documents/document_mapping.xlsx"
+    # document_mapping_file_path = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
    # merged_total_data_folder = r'/data/aus_prospectus/output/mapping_data/total/merged/'
    # os.makedirs(merged_total_data_folder, exist_ok=True)
    # data_file_base_name = os.path.basename(data_file_path)
    # output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name)
    # merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path)
-    # doc_source = "aus_prospectus"
+    os.environ["SSL_CERT_FILE"] = certifi.where()
-    # sample_document_list_folder: str = r'./sample_documents/'
+
-    # document_list_file: str = "aus_prospectus_100_documents_multi_fund_sample.txt"
+    doc_source = "aus_prospectus"
-    # pdf_folder: str = r"/data/aus_prospectus/pdf/"
+    sample_document_list_folder: str = r'./sample_documents/'
-    # output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
+    document_list_file: str = "aus_prospectus_29_documents_sample.txt"
-    # output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
+    pdf_folder: str = r"/data/aus_prospectus/pdf/"
-    # output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
+    output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
    output_extract_data_child_folder: str = r"/data/aus_prospectus/output/extract_data/docs/"
    output_mapping_child_folder: str = r"/data/aus_prospectus/output/mapping_data/docs/"
    # batch_initial_document(sample_document_list_folder=sample_document_list_folder,
    #                        document_list_file=document_list_file,
    #                        doc_source=doc_source,
@ -1382,34 +1520,29 @@ if __name__ == "__main__":
    #                        output_extract_data_child_folder=output_extract_data_child_folder,
    #                        output_mapping_child_folder=output_mapping_child_folder)
-    # special_doc_id_list = ["553242411"]
+    # get_aus_prospectus_document_category()
    re_run_extract_data = True
    re_run_mapping_data = True
    force_save_total_data = True
    doc_source = "aus_prospectus"
    # doc_source = "emea_ar"
    if doc_source == "aus_prospectus":
        # document_sample_file = (
-        #     r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
+        #     r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
        # )
        document_sample_file = (
-            r"./sample_documents/aus_prospectus_17_documents_sample.txt"
+            r"./sample_documents/aus_prospectus_46_documents_sample.txt"
        )
        # document_sample_file = (
        #     r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
        # )
        logger.info(f"Start to run document sample file: {document_sample_file}")
        with open(document_sample_file, "r", encoding="utf-8") as f:
-            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
+            special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
-        # document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
+                                    if len(doc_id.strip()) > 0]
-        document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
+        # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
-        # special_doc_id_list: list = [
+        # special_doc_id_list = ["462780211", "539999907"]
        #     "539790009",
        #     "542300403",
        #     "542301117",
        #     "542306317",
        #     "547567013",
        #     "552505237",
        #     "552505278",
        #     "554431052",
        #     "554851189",
        #     "555377021",
        #     "555654388",
        # ]
        special_doc_id_list: list = ["377377369"]
        pdf_folder: str = r"/data/aus_prospectus/pdf/"
        output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
        output_extract_data_child_folder: str = (
@ -1430,72 +1563,24 @@ if __name__ == "__main__":
            doc_source=doc_source,
            special_doc_id_list=special_doc_id_list,
            pdf_folder=pdf_folder,
            document_mapping_file=document_mapping_file,
            output_pdf_text_folder=output_pdf_text_folder,
            output_extract_data_child_folder=output_extract_data_child_folder,
            output_extract_data_total_folder=output_extract_data_total_folder,
            output_mapping_child_folder=output_mapping_child_folder,
            output_mapping_total_folder=output_mapping_total_folder,
            drilldown_folder=drilldown_folder,
            re_run_extract_data=re_run_extract_data,
            re_run_mapping_data=re_run_mapping_data,
            force_save_total_data=force_save_total_data
        )
    elif doc_source == "emea_ar": 
-        special_doc_id_list = [
+        special_doc_id_list = ["321733631"]
            "292989214",
            "316237292",
            "321733631",
            "323390570",
            "327956364",
            "333207452",
            "334718372",
            "344636875",
            "362246081",
            "366179419",
            "380945052",
            "382366116",
            "387202452",
            "389171486",
            "391456740",
            "391736837",
            "394778487",
            "401684600",
            "402113224",
            "402181770",
            "402397014",
            "405803396",
            "445102363",
            "445256897",
            "448265376",
            "449555622",
            "449623976",
            "458291624",
            "458359181",
            "463081566",
            "469138353",
            "471641628",
            "476492237",
            "478585901",
            "478586066",
            "479042264",
            "479793787",
            "481475385",
            "483617247",
            "486378555",
            "486383912",
            "492121213",
            "497497599",
            "502693599",
            "502821436",
            "503194284",
            "506559375",
            "507967525",
            "508854243",
            "509845549",
            "520879048",
            "529925114",
        ]
        # special_doc_id_list = ["532438210"]
        batch_run_documents(
-            doc_source=doc_source, special_doc_id_list=special_doc_id_list
+            doc_source=doc_source, 
            special_doc_id_list=special_doc_id_list,
            re_run_extract_data=re_run_extract_data,
            re_run_mapping_data=re_run_mapping_data,
            force_save_total_data=force_save_total_data
        )
    # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
--- a/mini_main.py
+++ b/mini_main.py
@ -0,0 +1,398 @@
 import os
 import json
 import numpy as np
 import pandas as pd
 from glob import glob
 from tqdm import tqdm
 import time
 import fitz
 import re
 from io import BytesIO
 from traceback import print_exc
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
 from utils.sql_query_util import query_document_fund_mapping
 from utils.pdf_util import PDFUtil
 from utils.biz_utils import add_slash_to_text_as_regex
 from core.page_filter import FilterPages
 from core.data_extraction import DataExtraction
 from core.data_mapping import DataMapping
 from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call
 from core.metrics import Metrics
 import certifi
 class EMEA_AR_Parsing:
    def __init__(
        self,
        doc_id: str,
        doc_source: str = "emea_ar",
        pdf_folder: str = r"/data/emea_ar/pdf/",
        output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
        output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
        extract_way: str = "text",
        drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
        text_model: str = "qwen-plus",
        image_model: str = "qwen-vl-plus",
    ) -> None:
        """Prepare everything needed to parse one document.

        Downloads the PDF, resolves and creates the output folders, runs the
        page filter, and reads the ``apply_drilldown`` switch from
        ``./configuration/<doc_source>/misc_config.json``.

        Args:
            doc_id: Identifier of the document to parse.
            doc_source: Source name; also selects the configuration folder.
            pdf_folder: Folder the PDF is downloaded into (created if missing).
            output_pdf_text_folder: Folder handed to ``FilterPages``.
            output_extract_data_folder: Root folder for extraction output; a
                ``by_<extract_way>/`` sub-folder is appended to it.
            extract_way: Extraction mode; ``None`` or empty means ``"text"``.
            drilldown_folder: Drilldown output folder (created if missing).
            text_model: Stored as ``self.text_model``.
            image_model: Stored as ``self.image_model``.
        """
        self.doc_id = doc_id
        self.doc_source = doc_source
        self.pdf_folder = pdf_folder
        os.makedirs(self.pdf_folder, exist_ok=True)
        self.pdf_file = self.download_pdf()

        # Resolve the extraction mode, defaulting to plain text.
        has_extract_way = extract_way is not None and len(extract_way) != 0
        self.extract_way = extract_way if has_extract_way else "text"

        # Only the image mode gets an image output folder.
        if self.extract_way == "image":
            self.output_extract_image_folder = (
                r"/data/emea_ar/output/extract_data/images/"
            )
            os.makedirs(self.output_extract_image_folder, exist_ok=True)
        else:
            self.output_extract_image_folder = None

        # Normalise the extraction root, then append the per-mode sub-folder.
        extract_root = output_extract_data_folder
        if extract_root is None or len(extract_root) == 0:
            extract_root = r"/data/emea_ar/output/extract_data/docs/"
        if not extract_root.endswith("/"):
            extract_root = extract_root + "/"
        self.output_extract_data_folder = f"{extract_root}by_{self.extract_way}/"
        os.makedirs(self.output_extract_data_folder, exist_ok=True)

        self.filter_pages = FilterPages(
            self.doc_id, self.pdf_file, self.doc_source, output_pdf_text_folder
        )
        self.page_text_dict = self.filter_pages.page_text_dict
        page_info, details = self.get_datapoint_page_info()
        self.datapoint_page_info, self.result_details = page_info, details
        self.datapoints = self.get_datapoints_from_datapoint_page_info()

        if drilldown_folder is None or len(drilldown_folder) == 0:
            drilldown_folder = r"/data/emea_ar/output/drilldown/"
        os.makedirs(drilldown_folder, exist_ok=True)
        self.drilldown_folder = drilldown_folder

        # Drilldown stays off unless the per-source config enables it.
        misc_config = {}
        misc_config_file = os.path.join(
            f"./configuration/{doc_source}/", "misc_config.json"
        )
        if os.path.exists(misc_config_file):
            with open(misc_config_file, "r", encoding="utf-8") as config_handle:
                misc_config = json.load(config_handle)
        self.apply_drilldown = misc_config.get("apply_drilldown", False)

        self.text_model = text_model
        self.image_model = image_model
    def download_pdf(self) -> str:
        """Download this document's PDF into ``self.pdf_folder``.

        Returns whatever ``download_pdf_from_documents_warehouse`` returns
        (presumably the local PDF path, going by the ``str`` annotation).
        """
        return download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
    def get_datapoint_page_info(self) -> tuple:
        """Run the page filter job and return its two results as a tuple.

        Returns:
            ``(datapoint_page_info, result_details)`` exactly as produced by
            ``self.filter_pages.start_job()``.
        """
        page_info, details = self.filter_pages.start_job()
        return (page_info, details)
    def get_datapoints_from_datapoint_page_info(self) -> list:
        """List the keys of ``self.datapoint_page_info`` except ``"doc_id"``.

        The key order of ``self.datapoint_page_info`` is preserved.
        """
        return [
            name for name in self.datapoint_page_info.keys() if name != "doc_id"
        ]
    def extract_data(
        self,
        re_run: bool = False,
    ) -> tuple:
        """Return the extracted data for this document plus drilldown annotations.

        Unless ``re_run`` is set, a previously saved
        ``<output_extract_data_folder>/json/<doc_id>.json`` is loaded instead
        of running ``DataExtraction`` again.

        Args:
            re_run: When ``True``, ignore any saved JSON and extract again.

        Returns:
            A ``(data_from_gpt, annotation_list)`` pair. ``annotation_list``
            is empty when drilldown is disabled or fails.
        """
        found_data = False
        if not re_run:
            output_data_json_folder = os.path.join(
                self.output_extract_data_folder, "json/"
            )
            os.makedirs(output_data_json_folder, exist_ok=True)
            json_file = os.path.join(output_data_json_folder, f"{self.doc_id}.json")
            if os.path.exists(json_file):
                logger.info(
                    f"The document: {self.doc_id} has been parsed, loading data from {json_file}"
                )
                with open(json_file, "r", encoding="utf-8") as f:
                    data_from_gpt = json.load(f)
                found_data = True
        if not found_data:
            try:
                data_extraction = DataExtraction(
                    self.doc_source,
                    self.doc_id,
                    self.pdf_file,
                    self.output_extract_data_folder,
                    self.page_text_dict,
                    self.datapoint_page_info,
                    self.datapoints,
                    extract_way=self.extract_way,
                    output_image_folder=self.output_extract_image_folder,
                    text_model=self.text_model,
                    image_model=self.image_model,
                )
                data_from_gpt = data_extraction.extract_data()
            except Exception as e:
                # Best effort: a failed extraction must not abort the caller.
                logger.error(f"Error: {e}")
                print_exc()
                # NOTE(review): this fallback is a dict, while
                # drilldown_pdf_document iterates its argument as a list of
                # dicts - confirm which shape downstream callers expect.
                data_from_gpt = {"data": []}
        # Drilldown data to relevant PDF document
        annotation_list = []
        if self.apply_drilldown:
            try:
                annotation_list = self.drilldown_pdf_document(data_from_gpt)
            except Exception as e:
                logger.error(f"Error: {e}")
                # Keep the traceback, as the extraction handler above does.
                print_exc()
        return data_from_gpt, annotation_list
    def drilldown_pdf_document(self, data_from_gpt: list) -> list:
        """
        Locate extracted values in the PDF and return the resulting annotations.

        Every record in ``data_from_gpt`` is read as a dict with the keys
        ``doc_id``, ``page_index`` and ``extract_data``; ``extract_data`` in
        turn provides ``data`` (a list of data-point dicts) and
        ``dp_reported_name`` (data point -> reported name). Each distinct value
        on a page becomes one drilldown request, which is handed to
        ``PDFUtil.batch_drilldown``. When ``self.drilldown_folder`` is set, the
        requests and the annotations are also written to
        ``<drilldown_folder>/data/<doc_id>_drilldown.xlsx`` and
        ``<doc_id>_drilldown.json``.

        :param data_from_gpt: page-level extraction records.
        :return: annotation dicts taken from the drilldown result; an empty
            list when the drilldown result is empty.
        :rtype: list
        """
        logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
        pdf_util = PDFUtil(self.pdf_file)
        # Default to the instance doc_id so the code after the loop always has a
        # bound name, even when data_from_gpt is empty. Records override it.
        doc_id = str(self.doc_id)
        drilldown_data_list = []
        for data in data_from_gpt:
            doc_id = str(data.get("doc_id", ""))
            page_index = data.get("page_index", -1)
            if page_index == -1:
                # No page to search on.
                continue
            # "extract_data" may be present but None; treat that like missing.
            extract_info = data.get("extract_data") or {}
            extract_data_list = extract_info.get("data", [])
            dp_reported_name_dict = extract_info.get("dp_reported_name", {})
            # Values already requested for this page; each one is requested once.
            highlighted_value_list = []
            for extract_data in extract_data_list:
                for data_point, value in extract_data.items():
                    if value in highlighted_value_list:
                        continue
                    # These data points are excluded from the drilldown.
                    if data_point in ["ter", "ogc", "performance_fee"]:
                        continue
                    drilldown_data_list.append(
                        {
                            "doc_id": doc_id,
                            "page_index": page_index,
                            "data_point": data_point,
                            "parent_text_block": None,
                            "value": value,
                            "annotation_attribute": {},
                        }
                    )
                    highlighted_value_list.append(value)
            for data_point, reported_name in dp_reported_name_dict.items():
                if reported_name in highlighted_value_list:
                    continue
                drilldown_data_list.append(
                    {
                        "doc_id": doc_id,
                        "page_index": page_index,
                        "data_point": f"{data_point}_reported_name",
                        "parent_text_block": None,
                        "value": reported_name,
                        "annotation_attribute": {},
                    }
                )
                highlighted_value_list.append(reported_name)
        drilldown_result = pdf_util.batch_drilldown(
            drilldown_data_list=drilldown_data_list,
            output_pdf_folder=self.drilldown_folder,
        )
        annotation_list = []
        if len(drilldown_result) > 0:
            logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
            annotation_list = drilldown_result.get("annotation_list", [])
            for annotation in annotation_list:
                annotation["doc_id"] = doc_id
            if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
                drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
                os.makedirs(drilldown_data_folder, exist_ok=True)
                drilldown_file = os.path.join(
                    drilldown_data_folder, f"{doc_id}_drilldown.xlsx"
                )
                drilldown_source_df = pd.DataFrame(drilldown_data_list)
                annotation_list_df = pd.DataFrame(annotation_list)
                # Put the annotation columns in a fixed order. If any column is
                # missing, the error is logged and the frame is left as is.
                try:
                    annotation_list_df = annotation_list_df[
                        [
                            "doc_id",
                            "pdf_file",
                            "page_index",
                            "data_point",
                            "value",
                            "matching_val_area",
                            "normalized_bbox",
                        ]
                    ]
                except Exception as e:
                    logger.error(f"Error: {e}")
                logger.info(f"Writing drilldown data to {drilldown_file}")
                # Export failures are logged only; the annotations are still
                # returned to the caller.
                try:
                    with pd.ExcelWriter(drilldown_file) as writer:
                        drilldown_source_df.to_excel(
                            writer, index=False, sheet_name="source_data"
                        )
                        annotation_list_df.to_excel(
                            writer, index=False, sheet_name="drilldown_data"
                        )
                except Exception as e:
                    logger.error(f"Error: {e}")
                # Return the annotations in the same shape as the exported sheet.
                annotation_list = annotation_list_df.to_dict(orient="records")
                try:
                    drilldown_json_file = os.path.join(
                        drilldown_data_folder, f"{doc_id}_drilldown.json"
                    )
                    with open(drilldown_json_file, "w", encoding="utf-8") as f:
                        json.dump(annotation_list, f, ensure_ascii=False, indent=4)
                except Exception as e:
                    logger.error(f"Error: {e}")
        return annotation_list
 def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> tuple:
    """
    Get the data-point page information for one document.

    :param doc_id: document id, passed to ``EMEA_AR_Parsing``.
    :param pdf_folder: folder passed to ``EMEA_AR_Parsing`` as ``pdf_folder``.
    :param doc_source: document source passed to ``EMEA_AR_Parsing``.
    :return: ``(datapoint_page_info, result_details)`` as returned by
        ``EMEA_AR_Parsing.get_datapoint_page_info``.
    :rtype: tuple
    """
    logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(
        doc_id, doc_source=doc_source, pdf_folder=pdf_folder
    )
    datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info()
    return datapoint_page_info, result_details
 def extract_data(
    doc_id: str,
    doc_source: str,
    pdf_folder: str,
    output_data_folder: str,
    extract_way: str = "text",
    re_run: bool = False,
    text_model: str = "qwen-plus",
    image_model: str = "qwen-vl-plus",
 ) -> tuple:
    """
    Run data extraction for a single document.

    :param doc_id: document id, passed to ``EMEA_AR_Parsing``.
    :param doc_source: document source passed to ``EMEA_AR_Parsing``.
    :param pdf_folder: folder passed to ``EMEA_AR_Parsing`` as ``pdf_folder``.
    :param output_data_folder: passed to ``EMEA_AR_Parsing`` as
        ``output_extract_data_folder``.
    :param extract_way: extraction mode forwarded to the parser.
    :param re_run: forwarded to ``EMEA_AR_Parsing.extract_data``.
    :param text_model: text model name forwarded to the parser.
    :param image_model: image model name forwarded to the parser.
    :return: ``(data_from_gpt, annotation_list)`` as returned by
        ``EMEA_AR_Parsing.extract_data``.
    :rtype: tuple
    """
    logger.info(f"Extract EMEA AR data for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=pdf_folder,
        output_extract_data_folder=output_data_folder,
        extract_way=extract_way,
        text_model=text_model,
        image_model=image_model,
    )
    data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run)
    return data_from_gpt, annotation_list
 def batch_extract_data(
    pdf_folder: str,
    doc_source: str = "emea_ar",
    output_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
    output_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
    extract_way: str = "text",
    special_doc_id_list: list = None,
    re_run: bool = False,
    text_model: str = "qwen-plus",
    image_model: str = "qwen-vl-plus",
 ) -> None:
    """
    Extract data for the PDF documents found in ``pdf_folder``.

    The document id of a PDF is its file name up to the first ".".

    * With a non-empty ``special_doc_id_list`` only those documents are
      processed and no combined workbook is written.
    * Without one, every PDF in the folder is processed and all extracted
      records are written to a time-stamped workbook in
      ``output_total_folder``.

    :param pdf_folder: folder that is searched for ``*.pdf`` files.
    :param doc_source: document source forwarded to ``extract_data``.
    :param output_child_folder: per-document output folder forwarded to
        ``extract_data`` as ``output_data_folder``.
    :param output_total_folder: folder for the combined workbook.
    :param extract_way: extraction mode forwarded to ``extract_data``.
    :param special_doc_id_list: optional list of document ids to restrict to.
    :param re_run: forwarded to ``extract_data``.
    :param text_model: text model name forwarded to ``extract_data``.
    :param image_model: image model name forwarded to ``extract_data``.
    :return: None
    """
    # os.path.join keeps the pattern valid whether or not pdf_folder ends
    # with a path separator.
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    selected_doc_ids = (
        {str(doc_id) for doc_id in special_doc_id_list}
        if special_doc_id_list
        else set()
    )
    extract_all = len(selected_doc_ids) == 0
    if extract_all:
        logger.info(f"No special doc_id list provided, extracting all documents in {pdf_folder}")
    result_list = []
    for pdf_file in tqdm(pdf_files):
        pdf_base_name = os.path.basename(pdf_file)
        doc_id = pdf_base_name.split(".")[0]
        if not extract_all and doc_id not in selected_doc_ids:
            continue
        # extract_data returns (data_from_gpt, annotation_list); only the
        # extracted records are collected here.
        data_from_gpt, _ = extract_data(
            doc_id=doc_id,
            doc_source=doc_source,
            pdf_folder=pdf_folder,
            output_data_folder=output_child_folder,
            extract_way=extract_way,
            re_run=re_run,
            text_model=text_model,
            image_model=image_model,
        )
        result_list.extend(data_from_gpt)
    if not extract_all:
        # The combined workbook is only produced for full-folder runs.
        return
    result_df = pd.DataFrame(result_list)
    result_df.reset_index(drop=True, inplace=True)
    logger.info(f"Saving the result to {output_total_folder}")
    os.makedirs(output_total_folder, exist_ok=True)
    time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    output_file = os.path.join(
        output_total_folder,
        f"extract_data_info_{len(pdf_files)}_documents_{time_stamp}.xlsx",
    )
    with pd.ExcelWriter(output_file) as writer:
        result_df.to_excel(writer, index=False, sheet_name="extract_data_info")
 def test_translate_pdf():
    """
    Run the PDF translation job on one hard-coded sample document.

    The PDF path and the output folder are fixed inside the function.
    """
    # Imported here rather than at module level, as in the original code.
    from core.data_translate import Translate_PDF

    Translate_PDF(
        r"/data/emea_ar/pdf/451063582.pdf",
        r"/data/translate/output/",
    ).start_job()
 if __name__ == "__main__":
    # Point SSL_CERT_FILE at the CA bundle shipped with certifi.
    os.environ["SSL_CERT_FILE"] = certifi.where()

    # Run configuration per document source: the document ids to process and
    # the input/output folders handed to batch_extract_data.
    run_settings = {
        "aus_prospectus": {
            "special_doc_id_list": ["412778803", "539266874"],
            "pdf_folder": r"./data/aus_prospectus/pdf/",
            "output_child_folder": r"./data/aus_prospectus/output/extract_data/docs/",
            "output_total_folder": r"./data/aus_prospectus/output/extract_data/total/",
        },
        "emea_ar": {
            "special_doc_id_list": ["514636993"],
            "pdf_folder": r"./data/emea_ar/pdf/",
            "output_child_folder": r"./data/emea_ar/output/extract_data/docs/",
            "output_total_folder": r"./data/emea_ar/output/extract_data/total/",
        },
    }

    # Switch the source by changing this value.
    # doc_source = "aus_prospectus"
    doc_source = "emea_ar"
    if doc_source not in run_settings:
        raise ValueError(f"Invalid doc_source: {doc_source}")

    batch_extract_data(
        doc_source=doc_source,
        extract_way="text",
        re_run=True,
        # Alternative text model: "qwen-plus"
        text_model="qwen-max",
        image_model="qwen-vl-plus",
        **run_settings[doc_source],
    )
--- a/performance.ipynb
+++ b/performance.ipynb
--- a/prepare_data.py
+++ b/prepare_data.py
@ -8,10 +8,12 @@ import re
 import time
 import traceback
 import json_repair
 from copy import deepcopy
 from utils.logger import logger
 from utils.pdf_download import download_pdf_from_documents_warehouse
 from utils.pdf_util import PDFUtil
 from core.auz_nz.hybrid_solution_script import final_function_to_match
 def get_unique_docids_from_doc_provider_data(doc_provider_file_path: str):
@ -1345,14 +1347,19 @@ def calc_typical_doc_metrics_v1():
        )
-def merge_aus_document_prospectus_data():
+def merge_aus_document_prospectus_data(aus_data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
                                       aus_document_mapping_file: str = r"document_mapping.xlsx",
                                       aus_prospectus_data_file: str = r"aus_prospectus_data.xlsx",
                                       document_mapping_sheet: str = "document_mapping",
                                       output_file: str = r"aus_document_prospectus.xlsx",
                                       output_sheet: str = "aus_document_prospectus"):
    """
    Merge AUS document and prospectus data.
    """
-    aus_document_file = r"/data/aus_prospectus/basic_information/from_2024_documents/document_mapping.xlsx"
+    aus_document_mapping_file = os.path.join(aus_data_folder, aus_document_mapping_file)
-    aus_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_prospectus_data.xlsx"
+    aus_prospectus_data_file = os.path.join(aus_data_folder, aus_prospectus_data_file)
-    aus_document_data = pd.read_excel(aus_document_file, sheet_name="document_mapping")
+    aus_document_data = pd.read_excel(aus_document_mapping_file, sheet_name=document_mapping_sheet)
-    aus_prospectus_data = pd.read_excel(aus_prospectus_file)
+    aus_prospectus_data = pd.read_excel(aus_prospectus_data_file)
    aus_document_data["DocumentId"] = aus_document_data["DocumentId"].astype(str)
@ -1362,10 +1369,10 @@ def merge_aus_document_prospectus_data():
        on=["FundClassId", "EffectiveDate"],
        how="left",
    )
-    aus_document_prospectus_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_document_prospectus.xlsx"
+    aus_document_prospectus_file = os.path.join(aus_data_folder, output_file)
    with pd.ExcelWriter(aus_document_prospectus_file) as writer:
        aus_document_prospectus_data.to_excel(
-            writer, sheet_name="aus_document_prospectus", index=False
+            writer, sheet_name=output_sheet, index=False
        )
@ -1384,15 +1391,19 @@ def pdf_exist():
            logger.info(f"pdf file exist: {pdf_file_path}")
-def prepare_multi_fund_aus_prospectus_document():
+def prepare_multi_fund_aus_prospectus_document(data_folder: str = r"/data/aus_prospectus/basic_information/from_2024_documents/",
-    data_folder = r"/data/aus_prospectus/basic_information/from_2024_documents/"
+                                               document_mapping_file: str = "document_mapping.xlsx",
-    document_mapping_file = os.path.join(data_folder, "document_mapping.xlsx")
+                                               document_mapping_sheet: str = "document_mapping",
-    document_data_file = os.path.join(data_folder, "aus_document_prospectus.xlsx")
+                                               document_fund_count_sheet: str = "document_fund_count",
                                               document_data_file: str = "aus_document_prospectus.xlsx",
                                               document_data_sheet: str = "aus_document_prospectus"):
    document_mapping_file = os.path.join(data_folder, document_mapping_file)
    document_data_file = os.path.join(data_folder, document_data_file)
-    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name="document_mapping")
+    document_mapping_df = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)
-    document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name="document_fund_count")
+    document_fund_count_df = pd.read_excel(document_mapping_file, sheet_name=document_fund_count_sheet)
-    document_data_df = pd.read_excel(document_data_file, sheet_name="aus_document_prospectus")
+    document_data_df = pd.read_excel(document_data_file, sheet_name=document_data_sheet)
    document_data_df.fillna("", inplace=True)
    # get data from document_data_df which SecurityName is not empty string
    document_data_df = document_data_df[document_data_df["SecurityName"] != ""]
@ -1456,11 +1467,299 @@ def prepare_multi_fund_aus_prospectus_document():
            f.write(f"{doc_id}\n")
 def set_mapping_to_ravi_data():
    """
    Add mapping columns to the "ravi_100_documents" extracted-fees workbook.

    Calls ``set_mapping_to_raw_name_data`` with fixed input, mapping and
    output paths. The arguments are passed by keyword because that function
    also takes ``raw_name_column`` and ``raw_name_mapping_column`` between
    them, so a positional call would shift every later argument.
    """
    set_mapping_to_raw_name_data(
        data_file_path=r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx",
        data_sheet="Sheet1",
        mapping_file_path=r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx",
        mapping_sheet="document_mapping",
        output_file_path=r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx",
    )
 def set_mapping_to_data_side_documents_data():
    """
    Run ``set_mapping_to_raw_name_data`` on the next-round 6-document
    ground-truth workbook, using fixed input, mapping and output paths.
    """
    # Settings used for earlier document sets, kept for reference.
    #
    # 17 documents:
    #   data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/Audited file_phase2.xlsx"
    #   data_sheet = "all"
    #   mapping_file_path = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
    #   mapping_sheet = "document_mapping"
    #   output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx"
    #
    # 46 documents:
    #   data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
    #   data_sheet = "ground_truth"
    #   raw_name_column = "raw_share_name"
    #   mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
    #   mapping_sheet = "document_mapping"
    #   raw_name_mapping_column = None
    #   output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    mapping_kwargs = dict(
        data_file_path=r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth.xlsx",
        data_sheet="ground_truth",
        raw_name_column="raw_share_name",
        mapping_file_path=r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx",
        mapping_sheet="document_mapping",
        raw_name_mapping_column=None,
        output_file_path=r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx",
    )
    set_mapping_to_raw_name_data(**mapping_kwargs)
 def set_mapping_to_raw_name_data(
    data_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees.xlsx",
    data_sheet: str = "Sheet1",
    raw_name_column: str = "raw_share_name",
    mapping_file_path: str = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx",
    mapping_sheet: str = "document_mapping",
    raw_name_mapping_column: str = None,
    output_file_path: str = r"/data/aus_prospectus/output/ravi_100_documents/AUS_Extracted_Fees_with_mapping.xlsx",
 ):
    """
    Add provider, fund and share-class mapping columns to a data workbook.

    The data sheet is read from ``data_file_path`` and extended with the
    columns ``provider_id``, ``provider_name``, ``fund_id``, ``fund_name``,
    ``sec_id`` and ``sec_name`` (initially empty strings). For each
    ``doc_id`` with rows in the mapping sheet (matched on ``DocumentId``):

    * the provider columns are filled from ``CompanyId`` / ``CompanyName``
      of the first mapping row;
    * if ``raw_name_mapping_column`` is ``"FundLegalName"``, each raw name
      is matched by equality against that mapping column and is only used
      when exactly one mapping row matches;
    * otherwise the raw names are matched against ``FundClassLegalName``
      through ``get_raw_name_db_match_result`` and the first mapping row of
      the matched name is used.

    The result is written to ``output_file_path``.

    :param data_file_path: Excel workbook with the data rows.
    :param data_sheet: sheet name inside ``data_file_path``.
    :param raw_name_column: data column that holds the raw names to map.
    :param mapping_file_path: Excel workbook with the document mapping.
    :param mapping_sheet: sheet name inside ``mapping_file_path``.
    :param raw_name_mapping_column: ``"FundLegalName"`` for exact matching
        on the fund name; any other value selects the name matcher.
    :param output_file_path: path of the Excel file to write.
    :return: None
    """
    data_df = pd.read_excel(data_file_path, sheet_name=data_sheet)
    for new_column in (
        "provider_id",
        "provider_name",
        "fund_id",
        "fund_name",
        "sec_id",
        "sec_name",
    ):
        data_df[new_column] = ""
    mapping_data = pd.read_excel(mapping_file_path, sheet_name=mapping_sheet)

    def _assign_share_mapping(row_mask, sec_id, sec_name, fund_id, fund_name):
        # Write the four share/fund columns for the rows selected by row_mask.
        data_df.loc[row_mask, "sec_id"] = sec_id
        data_df.loc[row_mask, "sec_name"] = sec_name
        data_df.loc[row_mask, "fund_id"] = fund_id
        data_df.loc[row_mask, "fund_name"] = fund_name

    for doc_id in data_df["doc_id"].unique().tolist():
        # Rows of this document; computed once and reused for every update.
        doc_mask = data_df["doc_id"] == doc_id
        raw_name_list = data_df.loc[doc_mask, raw_name_column].unique().tolist()
        doc_mapping_data = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_data) == 0:
            continue
        provider_id = doc_mapping_data["CompanyId"].values[0]
        provider_name = doc_mapping_data["CompanyName"].values[0]
        data_df.loc[doc_mask, "provider_id"] = provider_id
        data_df.loc[doc_mask, "provider_name"] = provider_name

        if raw_name_mapping_column == "FundLegalName":
            for raw_name in raw_name_list:
                find_df = doc_mapping_data[
                    doc_mapping_data[raw_name_mapping_column] == raw_name
                ]
                # Only an unambiguous (single-row) match is accepted.
                if len(find_df) != 1:
                    continue
                _assign_share_mapping(
                    doc_mask & (data_df[raw_name_column] == raw_name),
                    sec_id=find_df["FundClassId"].values[0],
                    sec_name=find_df["FundClassLegalName"].values[0],
                    fund_id=find_df["FundId"].values[0],
                    fund_name=find_df["FundLegalName"].values[0],
                )
        else:
            doc_db_name_list = doc_mapping_data["FundClassLegalName"].unique().tolist()
            all_match_result = get_raw_name_db_match_result(
                doc_id,
                provider_name,
                raw_name_list,
                doc_db_name_list,
                iter_count=60,
            )
            for raw_share_name in raw_name_list:
                matched_db_share_name = all_match_result.get(raw_share_name)
                # Skip names without a match (missing, None or empty).
                if matched_db_share_name is None or len(matched_db_share_name) == 0:
                    continue
                find_share_df = doc_mapping_data[
                    doc_mapping_data["FundClassLegalName"] == matched_db_share_name
                ]
                if len(find_share_df) == 0:
                    continue
                _assign_share_mapping(
                    doc_mask & (data_df[raw_name_column] == raw_share_name),
                    sec_id=find_share_df["FundClassId"].values[0],
                    sec_name=matched_db_share_name,
                    fund_id=find_share_df["FundId"].values[0],
                    fund_name=find_share_df["FundLegalName"].values[0],
                )

    # Best-effort column ordering: if any listed column is missing, the error
    # is printed and the frame keeps its current columns. The list names
    # "raw_share_name" literally, independent of raw_name_column.
    try:
        data_df = data_df[["doc_id",
                            "provider_id",
                            "provider_name",
                            "raw_fund_name",
                            "fund_id",
                            "fund_name",
                            "raw_share_name",
                            "sec_id",
                            "sec_name",
                            "management_fee_and_costs",
                            "management_fee",
                            "administration_fees",
                            "minimum_initial_investment",
                            "benchmark_name",
                            "performance_fee_costs",
                            "buy_spread",
                            "sell_spread",
                            "total_annual_dollar_based_charges",
                            "interposed_vehicle_performance_fee_cost",
                            "establishment_fee",
                            "contribution_fee",
                            "withdrawal_fee",
                            "exit_fee",
                            "switching_fee",
                            "activity_fee"
                            ]]
    except Exception as e:
        print(e)
    with open(output_file_path, "wb") as file:
        data_df.to_excel(file, index=False)
 def get_raw_name_db_match_result(
    doc_id: str, provider_name: str, raw_name_list: list, doc_share_name_list: list, iter_count: int = 30
 ):
    """
    Match raw names against database names in batches.

    ``raw_name_list`` is processed in consecutive slices of ``iter_count``
    names (the original author's note gives token limits of the underlying
    model as the reason for batching). Each slice is matched through
    ``get_final_function_to_match``; the database-name list it returns is
    used for the next slice. The caller's ``doc_share_name_list`` is deep
    copied first, so it is not modified.

    :return: dict combining the match results of all slices.
    """
    remaining_db_names = deepcopy(doc_share_name_list)
    merged_result = {}
    for start in range(0, len(raw_name_list), iter_count):
        name_batch = raw_name_list[start : start + iter_count]
        batch_result, remaining_db_names = get_final_function_to_match(
            doc_id, provider_name, name_batch, remaining_db_names
        )
        merged_result.update(batch_result)
    return merged_result
 def get_final_function_to_match(doc_id, provider_name, raw_name_list, db_name_list):
    """
    Match one batch of raw names against the available database names.

    With no database names, every raw name maps to an empty string.
    Otherwise ``final_function_to_match`` is called with
    ``doc_source="aus_prospectus"`` and the names it matched are removed
    from ``db_name_list`` via ``remove_matched_names``.

    :return: ``(match_result, db_name_list)``, i.e. the mapping for this
        batch and the database names that remain.
    """
    if len(db_name_list) == 0:
        # Nothing to match against.
        return {raw_name: "" for raw_name in raw_name_list}, db_name_list

    match_result = final_function_to_match(
        doc_id=doc_id,
        pred_list=raw_name_list,
        db_list=db_name_list,
        provider_name=provider_name,
        doc_source="aus_prospectus"
    )
    remaining_names = remove_matched_names(db_name_list, list(match_result.values()))
    return match_result, remaining_names
 def remove_matched_names(target_name_list: list, matched_name_list: list):
    """
    Remove already-matched names from ``target_name_list`` in place.

    ``None`` and empty entries of ``matched_name_list`` are ignored and
    duplicates count once. For each remaining name, its first occurrence in
    ``target_name_list`` (if any) is removed.

    :return: the same ``target_name_list`` object, after removal.
    """
    if len(matched_name_list) == 0:
        return target_name_list
    usable_names = [
        name
        for name in set(matched_name_list)
        if name is not None and len(name) > 0
    ]
    for name in usable_names:
        if name in target_name_list:
            target_name_list.remove(name)
    return target_name_list
 def adjust_data_file(source_file: str, targe_file: str):
    """
    Merge the rows of ``source_file`` into ``targe_file``.

    Both workbooks are read from sheet "Sheet1". Target rows whose
    ``DocumentId`` also occurs in the source are dropped, the source rows
    are placed in front of the remaining target rows, and the result
    overwrites ``targe_file``.
    """
    source_data = pd.read_excel(source_file, sheet_name="Sheet1")
    replaced_doc_ids = source_data["DocumentId"].unique().tolist()
    target_data = pd.read_excel(targe_file, sheet_name="Sheet1")
    # Keep only target rows for documents the source does not cover.
    kept_target_rows = target_data[~target_data["DocumentId"].isin(replaced_doc_ids)]
    combined = pd.concat([source_data, kept_target_rows], ignore_index=True)
    with open(targe_file, "wb") as file:
        combined.to_excel(file, index=False)
 def set_provider_to_ground_truth(
    groud_truth_file: str,
    ground_truth_sheet: str,
    document_mapping_file: str,
    document_mapping_sheet: str,
 ):
    """
    Add ``provider_id`` and ``provider_name`` to a ground-truth workbook.

    For each ``DocumentId`` in the ground-truth sheet that has rows in the
    mapping sheet, both columns are filled from ``CompanyId`` and
    ``CompanyName`` of the first mapping row. Documents without mapping
    rows keep empty strings. The workbook at ``groud_truth_file`` is then
    overwritten with the result.
    """
    ground_truth_df = pd.read_excel(groud_truth_file, sheet_name=ground_truth_sheet)
    ground_truth_df["provider_id"] = ""
    ground_truth_df["provider_name"] = ""
    mapping_data = pd.read_excel(document_mapping_file, sheet_name=document_mapping_sheet)

    for doc_id in ground_truth_df["DocumentId"].unique().tolist():
        doc_mapping_rows = mapping_data[mapping_data["DocumentId"] == doc_id]
        if len(doc_mapping_rows) == 0:
            continue
        provider_id = doc_mapping_rows["CompanyId"].values[0]
        provider_name = doc_mapping_rows["CompanyName"].values[0]
        doc_rows = ground_truth_df["DocumentId"] == doc_id
        ground_truth_df.loc[doc_rows, "provider_id"] = provider_id
        ground_truth_df.loc[doc_rows, "provider_name"] = provider_name

    # Best-effort column ordering: if a listed column is missing, the error
    # is printed and the frame keeps its current columns.
    ordered_columns = [
        "DocumentId",
        "provider_id",
        "provider_name",
        "raw_fund_name",
        "FundId",
        "FundLegalName",
        "raw_share_name",
        "FundClassId",
        "FundClassLegalName",
        "management_fee_and_costs",
        "management_fee",
        "administration_fees",
        "minimum_initial_investment",
        "benchmark_name",
        "performance_fee",
        "performance_fee_charged",
        "buy_spread",
        "sell_spread",
        "total_annual_dollar_based_charges",
        "interposed_vehicle_performance_fee_cost",
        "establishment_fee",
        "contribution_fee",
        "withdrawal_fee",
        "exit_fee",
        "switching_fee",
        "activity_fee",
        "hurdle_rate",
        "analyst_name",
    ]
    try:
        ground_truth_df = ground_truth_df[ordered_columns]
    except Exception as e:
        print(e)
    with open(groud_truth_file, "wb") as file:
        ground_truth_df.to_excel(file, index=False)
 def update_data_by_latest_ground_truth():
    """
    Placeholder for updating the current ground truth from the latest version.

    Not implemented: the function only binds the path of the latest
    ground-truth workbook to a local name; nothing is read or written.
    """
    # TODO: update current ground truth data by the latest version
    latest_ground_truth_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
 if __name__ == "__main__":
    # update_data_by_latest_ground_truth()
    # set_provider_to_ground_truth(
    #     groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
    #     ground_truth_sheet="Sheet1",
    #     document_mapping_file=r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx",
    #     document_mapping_sheet="document_mapping"
    # )
    set_mapping_to_data_side_documents_data()
    # source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
    # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
    # adjust_data_file(source_file=source_file, targe_file=target_file)
    # pdf_exist()
-    prepare_multi_fund_aus_prospectus_document()
+    # prepare_multi_fund_aus_prospectus_document()
-    # merge_aus_document_prospectus_data()
+    # merge_aus_document_prospectus_data(aus_data_folder=r"/data/aus_prospectus/basic_information/17_documents/",
    #                                    aus_document_mapping_file="aus_prospectus_17_documents_mapping.xlsx",
    #                                    aus_prospectus_data_file="aus_prospectus_data_17_documents_secid.xlsx",
    #                                    document_mapping_sheet="document_mapping",
    #                                    output_file="aus_prospectus_17_documents_data.xlsx",
    #                                    output_sheet="aus_document_prospectus")
    folder = r"/data/emea_ar/basic_information/English/sample_doc/emea_11_06_case/"
    file_name = "doc_ar_data_for_emea_11_06.xlsx"
    # get_document_with_all_4_data_points(folder, file_name, None)
--- a/sample_documents/aus_prospectus_29_documents_sample.txt
+++ b/sample_documents/aus_prospectus_29_documents_sample.txt
@ -0,0 +1,29 @@
 530101994
 550769189
 550522985
 539266893
 539241700
 539261734
 550533961
 506913190
 539266814
 521606716
 553449663
 528208796
 539266817
 539266874
 539266880
 526200514
 523516443
 526200513
 521606755
 557526129
 540028470
 531373053
 544886057
 557362556
 557362553
 520663234
 527969661
 541356150
 555377021
--- a/sample_documents/aus_prospectus_46_documents_sample.txt
+++ b/sample_documents/aus_prospectus_46_documents_sample.txt
@ -0,0 +1,46 @@
 377377369
 397107472
 401212184
 409723592
 411062815
 412778803
 414751292
 462770987
 471206458
 391080133
 391080140
 410899007
 420339794
 441280757
 446324179
 454036250
 384508026
 530101994
 550769189
 550522985
 539266893
 539241700
 539261734
 550533961
 506913190
 539266814
 521606716
 553449663
 528208796
 539266817
 539266874
 539266880
 526200514
 523516443
 526200513
 521606755
 557526129
 540028470
 531373053
 544886057
 557362556
 557362553
 520663234
 527969661
 541356150
 555377021
--- a/sample_documents/aus_prospectus_52_documents_sample.txt
+++ b/sample_documents/aus_prospectus_52_documents_sample.txt
@ -0,0 +1,52 @@
 420255482
 417904228
 448577874
 466381788
 502017146
 480854115
 481913497
 479996918
 475093006
 492202154
 495604527
 471624689
 397667293
 447335411
 490252419
 498429268
 369105359
 481728671
 466227438
 489870941
 481909371
 495604806
 557526143
 369219625
 377425488
 420281919
 420333972
 425940066
 439596540
 406568432
 411560137
 412398851
 412698096
 413157970
 319457827
 337806248
 341026731
 344001344
 362917516
 471895618
 450135866
 454032956
 471026558
 471052717
 471608905
 476325788
 478569026
 480716611
 480718722
 485628750
 486915646
 486927510
--- a/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
+++ b/sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt
@ -0,0 +1,87 @@
 430229604
 430249980
 434533711
 448576798
 448576868
 448576914
 448576924
 448577874
 448577877
 448578148
 448701586
 448906715
 448906720
 448906722
 448907811
 451234748
 454947973
 454947982
 454948291
 454948296
 455232983
 455235248
 462770987
 470958290
 470958296
 478920274
 478946988
 479996914
 479996918
 480713037
 480726184
 480726185
 480854103
 480854105
 480854113
 480854115
 480854118
 480854120
 480854121
 480854129
 481877313
 484628699
 484628701
 484628702
 484628703
 495516375
 495547519
 500579230
 506913190
 509581748
 520698753
 520702746
 520703007
 521591949
 521606716
 521606755
 523516443
 525464665
 528208796
 534933875
 539999907
 539999916
 540028470
 542294088
 544886057
 548035617
 550533961
 550769189
 552727485
 555377021
 556527310
 557362550
 557526104
 557526108
 557526111
 557526129
 557526130
 557526143
 557526145
 562753667
 562753673
 562754590
 570781265
 572302455
 572302463
 573372424
 577949367
--- a/sample_documents/aus_prospectus_verify_3_documents_sample.txt
+++ b/sample_documents/aus_prospectus_verify_3_documents_sample.txt
@ -0,0 +1,3 @@
 539999907
 455235248
 448576924
--- a/sample_documents/aus_prospectus_verify_6_documents_sample.txt
+++ b/sample_documents/aus_prospectus_verify_6_documents_sample.txt
@ -0,0 +1,6 @@
 553449169
 539791362
 573372424
 448906722
 462780211
 563608192
--- a/test_k_shape.py
+++ b/test_k_shape.py
@ -0,0 +1,77 @@
 import pandas as pd
 import numpy as np
 import sys
 import os
 # 添加项目路径
 sys.path.append('crypto_quant')
 from crypto_quant.core.biz.metrics_calculation import MetricsCalculation
 def test_k_shape():
    """Print a step-by-step diagnostic of the k-shape classification of one candle.

    A single, almost flat OHLC row is built, its derived features are printed,
    the two threshold conditions for the "一字" shape are evaluated, and the row
    (repeated 25 times) is passed to ``MetricsCalculation.set_k_shape`` so the
    resulting ``k_shape`` label can be printed. Nothing is asserted; the output
    is meant to be read by a person.
    """
    # Build the sample candle.
    candle = pd.DataFrame(
        {
            'open': [9.3030000000],
            'high': [9.3030000000],
            'low': [9.3020000000],
            'close': [9.3020000000],
        }
    )
    print("测试数据:")
    print(candle)
    print()

    # Derive the basic features from the raw OHLC columns.
    candle['high_low_diff'] = candle['high'] - candle['low']
    candle['open_close_diff'] = (candle['close'] - candle['open']).abs()
    candle['open_close_fill'] = candle['open_close_diff'] / candle['high_low_diff']
    candle['price_range_ratio'] = candle['high_low_diff'] / candle['close'] * 100

    print("计算的特征:")
    for feature in ('high_low_diff', 'open_close_diff', 'open_close_fill'):
        print(f"{feature}: {candle[feature].iloc[0]}")
    print(f"price_range_ratio: {candle['price_range_ratio'].iloc[0]}%")
    print()

    # Evaluate the two conditions for the "一字" shape.
    price_range_ratio = candle['price_range_ratio'].iloc[0]
    open_close_fill = candle['open_close_fill'].iloc[0]
    narrow_range = price_range_ratio < 0.01
    full_body = open_close_fill > 0.9
    print("条件检查:")
    print(f"price_range_ratio < 0.01: {narrow_range}")
    print(f"open_close_fill > 0.9: {full_body}")
    print()

    # Classify with MetricsCalculation. The single row is repeated so the
    # frame is long enough for the rolling windows used by set_k_shape.
    mc = MetricsCalculation()
    repeated = pd.concat([candle] * 25, ignore_index=True)
    classified = mc.set_k_shape(repeated.copy())
    print("分类结果:")
    print(f"k_shape: {classified['k_shape'].iloc[0]}")
    print()

    # Spell out which condition keeps the candle from being labelled "一字".
    print("详细分析:")
    print(f"价格范围比例: {price_range_ratio:.6f}%")
    print(f"实体占比: {open_close_fill:.6f}")
    print()
    print(
        "✓ 满足价格范围比例 < 0.01% 的条件"
        if narrow_range
        else f"✗ 不满足价格范围比例 < 0.01% 的条件 (实际: {price_range_ratio:.6f}%)"
    )
    print(
        "✓ 满足实体占比 > 0.9 的条件"
        if full_body
        else f"✗ 不满足实体占比 > 0.9 的条件 (实际: {open_close_fill:.6f})"
    )
 # Run the diagnostic only when this file is executed directly as a script.
 if __name__ == "__main__":
    test_k_shape() 
--- a/utils/benchmark_names.py
+++ b/utils/benchmark_names.py
@ -0,0 +1,169 @@
 # Benchmark / index name phrases used as literal search terms.
 # The (currently commented-out) call at the end of utils/biz_utils.py passes
 # this list as `search_terms` to get_bechmark_name(), which escapes each entry
 # and searches for it case-insensitively.
 # NOTE(review): entries are kept verbatim, including irregular spacing and
 # run-together words - presumably to mirror text as extracted from the source
 # PDFs; confirm before normalising any of them.
 # NOTE(review): several entries occur more than once (e.g.
 # "S&P/ASX 200 Accumulation Index", "Bloomberg AusBond Bank Bill Index");
 # every occurrence is searched separately, so matches for them are duplicated.
 benchmark_keywords =[
    "OECD G20 Inflation Index",
    "OECD G7 inflation index",
    "Absolute return (only applicable as an indirect performance fee on underlying investments in the AAM Managed Funds, being applied as 20% of the amount by which the increase in the Fund’s NAV for a financial year exceeds a return of 8% p.a)",
    "MSCI all countries world accumulation Index Unhedged",
    "MSCI World ex Australia Index",
    "S&P / ASX 300 Accumulation Index",
    "RBA Cash Rate",
    "MSCI World Net Index",
    "MSCI All Country World Net Index",
    "CSI300 index",
    "S&P/ASX Small Resources Accumulation Index",
    "Bloomberg AusBond Composite 0+ Yr Index",
    "Bloomberg AusBond Bank Bill Index",
    "Bloomberg Barclays Global Aggregate Bond Index (fully hedged) in Australian dollars",
    "MSCI World ex Australia (Standard) Index (Net Dividends) in Australian dollars",
    "S&P/ASX 300 A-REIT Accumulation Index",
    "FTSE EPRA/NAREIT Developed hedged (AUD) Net TRI",
    "Bloomberg Global Aggregate Index hedged to Australian dollars",
    "MSCI Emerging Markets Net Index in AUD",
    "MSCI World Net Total Return (AUD)",
    "S&P/ASX Small Ordinaries Accumulation Index",
    "MSCI All CountryWorld Index (Net in AUD)",
    "MSCI World Index (Net AUD)",
    "RBA Cash Rate plus 2.5% per annum",
    "Australian Listed Real Assets Index",
    "The Ausbond Bank Bill Index plus 1 %",
    "S&P/ASX Australian Government Bond Index",
    "MSCI World Index (net dividends reinvested) expressed in AUD",
    "JP Morgan Global Government Bond Index expressed in AUD",
    "Reserve Bank of Australia cash rate",
    "S&P/ASX 200 Accumulation Index",
    "MSCI World ex Australia (Standard) Index (Net Dividends)",
    "MSCI AWCI Small Cap Net (AUD) Index",
    "S&P/ASX 200 Total Return Index",
    "MSCI AC Asia ex-Japan Index",
    "MSCI China Free Index",
    "MSCI All Country Asia ex Japan Net Index in AUD",
    "MSCI All Country World Net Index",
    "Bloomberg AusBond Bank 0+Y TR AUD",
    "S&P/ASX 300 TR",
    "MSCI ACWI NR AUD",
    "Credit Suisse Lvg Loans Hdg AUD",
    "MSCI World Ex Australia NR AUD",
    "S&P/ASX 200 TR AUD",
    "FTSE Global Core Infra 50/50 NR Hdg AUD",
    "MSCI World NR AUD",
    "S&P/ASX 300 A-REIT TR",
    "S&P/ASX Small Ordinaries TR AUD",
    "CPI + 5.0%",
    "CPI + 4.5%",
    "S&P/ASX 100 TR",
    "CPI + 3.0%",
    "FTSE EPRA Nareit Dv ExAUS Rtl TR Hgd AUD",
    "MSCI EM NR AUD",
    "CPI + 2.5%",
    "S&P/ASX 200 A-REIT TR",
    "CPI + 5.5%",
    "MSCI World ex Australia Hdg AUD",
    "RBA Cash Rate Target",
    "FTSE EPRA Nareit Dev Rental Hdg NR AUD",
    "MSCI All Country Asia (ex Japan) Index",
    "Bloomberg AusBond Bank Bill Index",
    "MSCI All Countries World ex Australia Index",
    "MSCI All Countries World ex-Aus (Hedged)",
    "MSCI World Ex Australia Hedged",
    "MSCI All Country World Small Cap ex Australia Index",
    "MSCI ACWI Ex Australia Small Cap Index",
    "S&P / ASX 300 A- REIT Accumulation Index",
    "MSCI Australia 300 Index",
    "Bloomberg barclays global aggregate Index",
    "Bloomberg AUS Bond Bank bill index",
    "MSCI All Country World SMID Cap Net Index in AUD",
    "MSCI Emerging Markets Total Return Net Index (in AUD)",
    "MSCI World Net Total Return Index ex-Australia",
    "MSCI World Small Cap Net Total return (TR) Index in AUD",
    "S&P/ASX Emerging Companies Accumulation Index",
    "S&P/ASX 100 Accumulation Index",
    "S&P/ASX Small Industrials Accumulation Index",
    "MSCI/Mercer Australia Core Wholesale Monthly Property Fund Index",
    "ICE BofA Green Bond Hedged",
    "MSCI ACWI ex Australia Index Net Dividend Withholding Tax (AUD)",
    "EMIX Global Mining Index",
    "FTSE Gold Mines Index return (AUD)",
    "MSCI All Country Asia ex-Japan Net Index in AUD",
    "MSCI All Country World Net Index in AUD",
    "S&P/ASX 200 Accumulation Index",
    "FTSE World Government Bond Index hedged into Australian dollars",
    "S&P 500 Total Return Index (Net Dividends) in AUD",
    "S&P/ASX 200 A-REIT (Sector) (TR) Index",
    "S&P North American Technology Sector Index in AUD",
    "MSCI All Country World Net Index in AUD (for comparison purposes only)",
    "Barclays Global Aggregate Corporate Total Return Index. Hedged to AUD",
    "MSCI World Index",
    "MSCI All Country World Index, incl. net dividends, measured in AUD",
    "FTSE World Government Bond Index hedged to AUD",
    "S&P 500 Total Return Index (Net Dividends) in AUD",
    "S&P/ASX 200 A-REIT (Sector) (TR) Index",
    "MSCI World Index (Total Return Net) Hedged to AUD",
    "Bloomberg Barclays Multiverse Index (Total Return Gross) Hedged to AUD",
    "S&P Global Infrastructure Index A$ Hedged Net Total Return",
    "MSCI Emerging Markets ex-Tobacco ex-Controversial Weapons Net Dividends Reinvested in AUD Index",
    "FTSE EPRA/NAREIT Developed ex Australia Rental Index NTR (AUD hedged)",
    "S&P/ASX 200 Accumulation Index",
    "MSCI All Country World Net Index in AUD",
    "FTSE Developed Core Infrastructure 50/50 Index (AUD hedged) Net TRI",
    "S&P Global Infrastructure Index (Net Total Return $A Hedged)",
    "S&P Global Infrastructure Index A$ Hedged Net Total Return and yield of 10-year Australian Government Bonds",
    "MSCI World Index (Total Return Net) Hedged to AUD",
    "CPI + 2.5% p.a",
    "CPI + 3.5% p.a.",
    "Bloomberg Barclays Global Aggregate Bond Index (AUD Hedged)",
    "MSCI All Country World ex-Australia Index with Net Dividends Reinvested (AUD Hedged)",
    "MSCI All Country World ex-Australia Index with Net Dividends Reinvested (Unhedged)",
    "CPI + 1.0% p.a.",
    "CPI + 4.0% p.a.",
    "CPI + 5.0% p.a.",
    "No Performance fee charge",
    "The higher of the MSCI World ex-Australia Total Return Net Index in AUD and the 10 Year Australian Government Bond Yield",
    "The higher of 6% or the Australian Government 10 year Bond Yield +3.5%",
    "ICE BofA Global High Yield Index (AUD hedged)",
    "Bloomberg Barclays U.S. Aggregate Bond Index (AUD Hedged Total Return)",
    "S&P/ASX 200 All Ordinaries Accumulation Index",
    "S&P/ASX 300 Property Trusts Accum",
    "UBS Composite Bond (All Maturities)",
    "Bloomberg AusBond Bank Bill + 2.5%",
    "FTSE Dvlp Core Infra 50/50 NR Hdg AUD",
    "Bloomberg AusBond BB AUD",
    "FTSE Global Core Infra 50/50 TR Hdg AUD",
    "MSCI World SMID Index (Net) (AUD)",
    "S&P ASX Small Ordinaries Accumulation Index",
    "MSCI ACWI With Special Tax Index (with net dividends reinvested), expressed in AUD",
    "MSCI Asia Ex-Japan (Net, AUD)",
    "MSCI World (Net, AUD)",
    "MSCI Emerging Markets Net Total Return Index (AUD) Unhedged",
    "MSCI Asia Ex Japan Small Companies Index (Unhedged in AUD)",
    "RBA Consumer Price Inflation (CPI) + 3.0%",
    "ASX All Ordinaries Index",
    "Australian Government 10-year Bond Yield",
    "MSCI AC Asia ex Japan (Standard) Index (Net Dividends) in AUD",
    "Bloomberg AusBond Non-Govt 0+ Yr Index",
    "Bloomberg AusBond Composite 0+Yr Index",
    "MSCI Emerging Markets (Standard) Index (Net Dividends) in AUD",
    "FTSE EPRA/NAREIT Developed ex Australia hedged in AUD Net TRI",
    "MSCI ACWI IMI Index in AUD",
    "Bloomberg AusBond Govt 0+Yr Index",
    "Pendal MidCap Custom Index",
    "Australian CPI",
    "MSCI Emerging Markets Net Total Return Index",
    "MSCI ACWI Small Cap Index (net div. reinv.)",
    "Reserve Bank of Australia Official Cash Rate",
    "Bloomberg Global Aggregate Index ($A hedged)",
    "MSCI World ex Australia net total return index (unhedged)",
    "MSCI All Country World Index (with net dividends reinvested) in AUD",
    "FTSE Developed Core Infrastructure Hedged into AUD Index",
    "S2VenturaCapitalStableFundHybridIndex",
    "S2VenturaDiversified50FundHybridIndex",
    "S2VenturaGrowth70FundHybridIndex",
    "MSCI World ex Australia ex Tobacco ex Controversial Weapons Index (AUD) with net dividends reinvested",
    "MSCI World ex Australia (unhedged in $A)",
    "S&P Global infrastructure AUD Net",
    "MSCIWorldexAustraliaIndex,in$Aunhedgedwithnetdividendsreinvested",
    "MSCI World Total Return Index with net dividends in Australian Dollars",
    "MSCI Emerging Markets Index in Australian Dollars",
    "MSCI All Country World Index ex-US Small Cap USD Gross Total Return Index (with gross dividends reinvested, reported in Australian dollars and unhedged)",
    "MSCI All Country World Index ex-Australia (with gross dividends reinvested reported in Australian dollars and unhedged)",
    "MSCI All Country World Index ex-Australia Net Total Return Index 100% hedged to AUD (reported in Australian dollars)",
    "S&P 300 Industrials Accumulation Index"
 ]
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -4,7 +4,7 @@ import time
 from utils.logger import logger
 from copy import deepcopy
 from traceback import print_exc
-
+import utils.benchmark_names
 total_currency_list = [
    "USD",
@ -1034,3 +1034,88 @@ def remove_abundant_data_detail(data_detail_list: list,
        if remove_data in data_detail_list:
            data_detail_list.remove(remove_data)
    return data_detail_list
 def replace_special_table_header(replace_table_header_config: list, page_text: str):
    r"""
    Rewrite a special two-layer table header in ``page_text`` to a standard
    header, then separate fused decimal numbers.

    Background: some fee tables have a main header row and a sub header row,
    e.g.

    raw header 1:
        Investment Option \n
        Management \nfee (i) \n(% pa) \n
        Indirect costs (i) \n(% pa) \n
        Estimated performance fees (ii) \n(% pa) \n
        Transaction \ncosts (% pa) \n
        Buy/sell \nspreads (%) \n
        Recoverable \nexpenses (iii) \n
        Estimated \nother \nindirect costs \n
        Performance fees \ncharged to the \nInvestment \nOption by \nunderlying \nmanagers \n
        Performance fees \ncharged by \ninterposed \nvehicles \n

    raw header 2:
        Fund \n
        Management \nfee 1 \n(% pa) \n
        Indirect costs1\n(% pa)\n
        Estimated performance fees2\n(% pa)\n
        Transaction \ncosts \n(% pa) \n
        Buy/sell \nspreads (%) \n
        Recoverable \nexpenses 3 \n
        Estimated \nother indirect \ncosts \n
        Performance \nfees charged \nto the Fund \nby underlying \nmanagers \n
        Performance \nfees charged \nby interposed \nvehicles \n

    The goal is to merge the sub header into the main header:
      * "Indirect costs (i) \n(% pa)" becomes
        "Recoverable expenses\nEstimated other indirect costs"
      * "Estimated performance fees2\n(% pa)" becomes
        "Performance fees charged to the Fund by underlying managers\n
        Performance fees charged by interposed vehicles"
      * the second-layer header lines (the last four lines of each raw
        header above) are removed.
    How this is achieved is entirely defined by the regexes and replacement
    text supplied in the configuration.

    :param replace_table_header_config: list of dicts; each dict may carry
        ``"regex_all_list"`` (regex patterns tried in order) and
        ``"replace_text"`` (the ``re.sub`` replacement; when absent the first
        matched text itself is used as the replacement).
    :param page_text: text of the page to process.
    :return: the processed text. When the configuration is ``None`` or empty
        the text is returned untouched (no number splitting either).
    """
    if not replace_table_header_config:
        return page_text

    # Only the first pattern that matches, across all config entries, is
    # applied; everything after it is ignored.
    for replace_info in replace_table_header_config:
        header_replaced = False
        for regex_all in replace_info.get("regex_all_list", []):
            table_header_search = re.search(regex_all, page_text)
            if table_header_search is None:
                continue
            original_text = table_header_search.group()
            page_text = re.sub(
                regex_all,
                replace_info.get("replace_text", original_text),
                page_text,
            )
            header_replaced = True
            break
        if header_replaced:
            break

    # Split fused numbers such as "1.320.00" into "1.32 0.00".
    # The following number is only looked ahead at, not consumed, so a run of
    # three or more fused numbers ("1.320.001.23") is split at every boundary
    # instead of at every other one.
    page_text = re.sub(r'(\d\.\d{2})(?=\d\.\d{2})', r'\1 ', page_text)
    return page_text
 def get_bechmark_name(text, search_terms, word_count=300):
    """
    Collect every occurrence of the given terms in ``text`` together with the
    words that follow it.

    Each term is matched literally and case-insensitively as a whole phrase
    (it must not be directly preceded or followed by a word character). A match
    is extended by up to ``word_count`` whitespace-separated words that follow
    the term.

    :param text: text to search; ``None`` or empty yields ``""``.
    :param search_terms: iterable of literal phrases; ``None`` or empty yields
        ``""``.
    :param word_count: maximum number of trailing words captured per match.
    :return: all matched snippets joined by a single space, in term order and
        then in order of appearance; ``""`` when nothing matched.
    """
    if not text or not search_terms:
        return ""

    trailing_words = r'(?:\s+\S+){0,' + str(word_count) + '}'
    results = []
    for term in search_terms:
        try:
            # Lookarounds instead of \b: a trailing \b can never match after a
            # term that ends in a non-word character such as "%", ")" or ".",
            # which silently disabled terms like "CPI + 5.0%".
            pattern = r'(?<!\w)' + re.escape(term) + r'(?!\w)' + trailing_words
            results.extend(
                match.group() for match in re.finditer(pattern, text, re.IGNORECASE)
            )
        except (re.error, TypeError) as e:
            # A bad term is logged and skipped so the remaining terms are
            # still searched.
            logger.error(f"An error occurred while processing the term '{term}': {e}")
    return " ".join(results)
 #benchmark_name = get_bechmark_name(text, benchmark_names.benchmark_keywords)
--- a/utils/gpt_utils.py
+++ b/utils/gpt_utils.py
@ -6,6 +6,7 @@ import os
 from time import sleep
 import base64
 import dotenv
 import httpx
 # loads .env file with your OPENAI_API_KEY
 dotenv.load_dotenv()
@ -76,7 +77,7 @@ def chat(
        max_tokens = 4096
    client = AzureOpenAI(
-        azure_endpoint=azure_endpoint, api_key=api_key, api_version=api_version
+        azure_endpoint=azure_endpoint, api_key=api_key, api_version=api_version, http_client=httpx.Client(verify=False)
    )
    if (
--- a/utils/pdf_download.py
+++ b/utils/pdf_download.py
@ -4,6 +4,7 @@ import os
 import platform
 from utils.logger import logger
 import dotenv
 import certifi
 # loads .env file with your OPENAI_API_KEY
 dotenv.load_dotenv()
@ -39,8 +40,18 @@ def download_pdf_from_documents_warehouse(pdf_directory: str, doc_id: str):
        if os_name == "windows":
            ACCESS_KEY = os.getenv('ACCESS_KEY')
            SECRET_KEY = os.getenv('SECRET_KEY')
-            session = boto3.Session(aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
+            AWS_SESSION_TOKEN = os.getenv('AWS_SESSION_TOKEN')
-            s3 = session.client('s3')
+            if AWS_SESSION_TOKEN:
                s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
                            aws_access_key_id=ACCESS_KEY,
                            aws_secret_access_key=SECRET_KEY,
                            aws_session_token=AWS_SESSION_TOKEN
                            )
            else:
                s3 = boto3.client("s3", region_name="us-east-1", verify=certifi.where(), 
                                aws_access_key_id=ACCESS_KEY,
                                aws_secret_access_key=SECRET_KEY
                                )
        else:
            s3 = boto3.client('s3')
--- a/utils/qwen_utils.py
+++ b/utils/qwen_utils.py
@ -0,0 +1,148 @@
 import requests
 import json
 import os
 from bs4 import BeautifulSoup
 import time
 from time import sleep
 from datetime import datetime
 import pytz
 import pandas as pd
 import dashscope
 import dotenv
 import base64
 dotenv.load_dotenv()
 ali_api_key = os.getenv("ALI_API_KEY_QWEN")
 def _call_qwen_with_retry(call, max_attempts: int = 3, delay_seconds: int = 2):
    """Invoke ``call()`` until it returns a response with status_code 200.

    A failed attempt (non-200 response or raised exception) is printed and
    followed by a pause of ``delay_seconds`` before the next attempt.

    :param call: zero-argument callable performing one request.
    :param max_attempts: maximum number of attempts.
    :param delay_seconds: pause after each failed attempt.
    :return: the last response obtained, or ``None`` if no attempt returned one.
    """
    response = None
    for attempt in range(1, max_attempts + 1):
        try:
            print(f"调用阿里云Qwen模型, 次数: {attempt}")
            response = call()
            if response.status_code == 200:
                break
            print(f"调用阿里云Qwen模型失败: {response.code} {response.message}")
        except Exception as e:
            print(f"调用阿里云Qwen模型失败: {e}")
        sleep(delay_seconds)
    return response


 def _first_choice_content(response):
    """Return ``output.choices[0].message.content`` of a response.

    Raises ``IndexError`` when the response has no choices.
    """
    return (
        response.get("output", {})
        .get("choices", [])[0]
        .get("message", {})
        .get("content", "")
    )


 def _image_content_to_text(content):
    """Flatten the content of an image-model reply into one string.

    A list is flattened by taking the ``"text"`` of dict items and string items
    as they are, separated by blank lines. A plain string is returned stripped
    (previously it was silently dropped). Anything else yields ``""``.
    """
    if isinstance(content, str):
        return content.strip()
    pieces = []
    if isinstance(content, list):
        for item in content:
            if isinstance(item, dict):
                pieces.append(item.get("text", ""))
            elif isinstance(item, str):
                pieces.append(item)
    return "\n\n".join(pieces).strip()


 def chat(
    prompt: str,
    text_model: str = "qwen-plus",
    image_model: str = "qwen-vl-plus",
    image_file: str = None,
    image_base64: str = None,
    enable_search: bool = False,
 ):
    """Send a prompt, optionally with one image, to a Qwen model via dashscope.

    When ``image_base64`` is given, or can be produced from an existing
    ``image_file``, the request goes to ``image_model`` through
    ``dashscope.MultiModalConversation``; otherwise it goes to ``text_model``
    through ``dashscope.Generation``. Each request is attempted up to 3 times.

    :param prompt: user prompt text.
    :param text_model: model name used for text-only requests.
    :param image_model: model name used when an image is supplied.
    :param image_file: path of an image; only read when ``image_base64`` is
        ``None`` and the file exists.
    :param image_base64: base64-encoded image data; takes precedence over
        ``image_file``.
    :param enable_search: passed as ``enable_search`` and as the
        ``forced_search`` search option of text requests.
    :return: ``(result, error)``. ``result`` holds ``model``, ``response``,
        ``prompt_token``, ``completion_token`` and ``total_token``; for a
        non-200 reply ``response`` is the error code and message. ``error`` is
        ``True`` (with ``result == {}``) when an exception occurred or no
        response could be obtained at all.
    """
    try:
        if (
            image_base64 is None
            and image_file is not None
            and len(image_file) > 0
            and os.path.exists(image_file)
        ):
            image_base64 = encode_image(image_file)

        use_image_model = image_base64 is not None and len(image_base64) > 0
        if use_image_model:
            messages = [
                {
                    "role": "user",
                    "content": [
                        {"text": prompt},
                        {
                            "image": f"data:image/png;base64,{image_base64}",
                        },
                    ],
                }
            ]
            response = _call_qwen_with_retry(
                lambda: dashscope.MultiModalConversation.call(
                    api_key=ali_api_key,
                    model=image_model,
                    messages=messages,
                )
            )
        else:
            messages = [{"role": "user", "content": prompt}]
            response = _call_qwen_with_retry(
                lambda: dashscope.Generation.call(
                    api_key=ali_api_key,
                    model=text_model,
                    messages=messages,
                    enable_search=enable_search,
                    # force the web search whenever searching is enabled
                    search_options={"forced_search": enable_search},
                    result_format="message",
                )
            )

        if response is None:
            # Every attempt raised, so there is nothing to report from.
            raise RuntimeError("no response received after all retry attempts")

        if response.status_code == 200:
            content = _first_choice_content(response)
            if use_image_model:
                response_contents = _image_content_to_text(content)
            else:
                response_contents = content
            token = response.get("usage", {}).get("total_tokens", 0)
        else:
            response_contents = f"{response.code} {response.message}"
            if use_image_model:
                response_contents += " 无法分析图片"
            token = 0

        result = {
            "model": image_model if use_image_model else text_model,
            "response": response_contents,
            "prompt_token": response.get("usage", {}).get("input_tokens", 0),
            "completion_token": response.get("usage", {}).get("output_tokens", 0),
            "total_token": token,
        }
        # Pause before returning, as the original implementation did.
        sleep(2)
        return result, False
    except Exception as e:
        print(f"调用阿里云Qwen模型失败: {e}")
        return {}, True
 def encode_image(image_path: str):
    """Return the content of the file at ``image_path`` as a base64 string.

    ``None`` is returned when the path is ``None``, empty, or does not exist.
    """
    readable = (
        image_path is not None
        and len(image_path) > 0
        and os.path.exists(image_path)
    )
    if not readable:
        return None
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
--- a/yml/aus_prospectus.yml
+++ b/yml/aus_prospectus.yml
@ -0,0 +1,27 @@
 Example to extract data from Australia Prospectus PDF Document.
 Sample:
  {
    "doc_id": "412778803"
  }
 Author: Blade He
 ---
 parameters:
  # Single request-body parameter; the JSON payload shape is given by `schema`.
  - name: Australia Prospectus Document Id
    in: body
    type: string
    required: true
    description: Example to extract data from Australia Prospectus PDF Document.
    default: {"doc_id": "412778803"}
    schema:
      # `required` lists property names, so it must match the key declared
      # under `properties` (doc_id), not a display label.
      required:
        - doc_id
      properties:
        doc_id:
          description: Australia Prospectus Document Id
          required: true
          type: string
 responses:
  200:
    description: successfully.
  400:
    description: failed.
Author	SHA1	Message	Date
blade	7b0c825a39	fix issue	2025-11-12 14:07:55 +08:00
blade	ea81197bcd	update for apply ALI QWEN as Demo	2025-11-11 13:33:57 +08:00
blade	255752c848	add mini_main.py	2025-11-10 16:55:55 +08:00
Blade He	37cf06a394	Confirm span pages calculation, the management fee and costs page only with management_fee_and_costs and management_fee datapoints	2025-04-03 18:08:27 -05:00
Blade He	f333cc30f5	1. fit the scenario when document type is not 1 or 4, 5 2. support the scenario: "investment fees and costs including performance" statement in performance fee data page, instead of in management fee and costs data page.	2025-04-03 17:06:43 -05:00
Blade He	4b896f4460	update latest metrics based on optimized matching algorithm Support: 2663 -> 2773 percentage of Share Matched: 80.08 -> 82.59 F1: 0.956 -> 0.943	2025-04-02 20:39:31 -05:00
Blade He	427a379b3b	1. support re-call ChatGPT API to match non-matched prediction fund/ share names 2. If document fund amount less than 3, cancel the production name judgment logic	2025-04-02 16:34:41 -05:00
Blade He	4cee95db9a	fix issue for post actions	2025-03-31 22:04:31 -05:00
Blade He	50e51e0894	recover main.py	2025-03-31 17:16:05 -05:00
Blade He	a42033f848	Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-31 17:09:06 -05:00
Blade He	984c686bf3	support separate tables and pages data which with specific biz rules	2025-03-31 17:08:49 -05:00
Russell Spence	ac6332ad46	Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-28 08:36:58 -05:00
Russell Spence	f7b9652c75	gitignore a virtual environment directory	2025-03-28 08:36:44 -05:00
Blade He	355b145cf7	If found total_annual_dollar_based_charges and could be divisible by 52 or 12, then set the fund name and share name to be document production name	2025-03-28 01:33:33 -05:00
Blade He	46f86b124b	update instructions fund name section structure	2025-03-28 00:51:51 -05:00
Blade He	8a5723c150	optimize for Entry Fee/ Nil Entry case	2025-03-27 21:10:33 -05:00
Blade He	d925992326	1. Support the keywords of complex special cases to be regex 2. Support set sub-datapoints list to complex special cases node. 3. Simplify the common management fee and costs instructions. 4. Add markdown title characters: ## or ### to instructions.	2025-03-27 16:00:19 -05:00
Blade He	dc560e1e01	update metrics	2025-03-26 23:14:28 -05:00
Blade He	ff2325c72d	1. fix issue for assign values based on production name 2. optimize instructions for extract non-necessary data by Cost of Product message	2025-03-26 18:58:45 -05:00
Blade He	8ad472fb39	UPDATE metrics code file	2025-03-24 18:00:53 -05:00
Blade He	dd1f8f76ae	update for metrics	2025-03-24 17:12:13 -05:00
Blade He	4edc4b4768	clean code	2025-03-24 17:10:16 -05:00
Blade He	9be6d1296d	update benchmark check logic	2025-03-19 00:52:25 -05:00
Blade He	5ba39a394b	1. keep fund/ share db list before applying LLM 2. add key words for interposed_vehicle_performance_fee_cost	2025-03-18 22:15:31 -05:00
Blade He	c71936c5ff	1. optimize benchmark_name instructions 2. consider possible with multiple same raw fund names in documents, not to remove unmatched_db_list when match relevant raw fund/ share name Otherwise, it will occur some raw names couldn't match db name issue.	2025-03-18 17:22:21 -05:00
Blade He	0cea2e501b	For AUS Prospectus, cancel visiting Vision ChatGPT when page contents without any numeric text or perhaps with messy code. (But should keep this logic for EMEA LUX AR, because of some special providers cases for this market documents.)	2025-03-18 14:15:43 -05:00
Ravi Maheshwari	6614972849	Raw Code added to identify benchmark names	2025-03-18 18:57:08 +05:30
Ravi Maheshwari	0ad17e338e	Code added to save anomilities	2025-03-18 17:56:50 +05:30
Ravi Maheshwari	2817490652	Code added to save anomilities	2025-03-18 17:54:33 +05:30
Ravi Maheshwari	ad371f6584	Changed Performance matrix code to get all anomilities to analyze and Prompt to get better accuracy	2025-03-18 16:43:55 +05:30
Blade He	b3941ee4b3	update instructions for total_annual_dollar_based_charges	2025-03-17 15:07:02 -05:00
Ravi Maheshwari	0ce604021c	updating gitignore	2025-03-17 18:52:49 +05:30
Ravi Maheshwari	dc9180ca1b	Rollback file name	2025-03-17 17:09:48 +05:30
Ravi Maheshwari	af3d1222a6	Changes done for Bugfix: 1. SSL issue \n2. Ignore Example Tables \n3. Performacne fee	2025-03-17 17:07:08 +05:30
Blade He	dd15c1c48e	Optimize for benchmark name	2025-03-14 11:51:10 -05:00
Blade He	bceff71fa4	Set re-run parameters to be True re_run_extract_data = True re_run_mapping_data = True force_save_total_data = True	2025-03-14 04:03:22 -05:00
Blade He	0f65537478	optimize instructions for minimum_initial_investment	2025-03-14 04:02:15 -05:00
Blade He	f539340d04	1. optimize instructions Only load relevant fund name for investment objective, instead of full page text with the most recent investment objective 2. Exclude the table which with only one numeric column: Cost Product	2025-03-14 01:04:51 -05:00
Blade He	551f754379	Fix issue when saving data extraction data	2025-03-13 18:36:04 -05:00
Blade He	a48af9ddf0	A. Metrics score Blade's updates 1. Set the secondary key to be the share class name, instead of the fund name 2. Remove the data point which support is 0 to calculate the metrics 3. Add the message list to store the error message 4. Support save metrics/ error message to excel file 5. Support statistics for different document list 6. Set F1-Score to the first column in the metrics table B. Optimize instructions for benchmark_name	2025-03-13 17:52:06 -05:00
Blade He	a090b5cc9e	1. metrics's key should be share class name: sec_name 2. support output metrics data as Excel file 3. Optimize instructions for performance_fee_costs	2025-03-13 11:53:27 -05:00
Ravi Maheshwari	1f6b781b12	Merge branches 'aus_prospectus_ravi' and 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-13 17:34:35 +05:30
Ravi Maheshwari	97da7e4961	Added code to identify anomaly cases and performance matrix and updated for pdf downloading code	2025-03-13 17:31:54 +05:30
Blade He	fd2430082c	optimize instructions for management_fee_and_costs and buy_spread, sell_spread	2025-03-13 02:59:19 -05:00
Ravi Maheshwari	336fd9a24f	Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-13 11:39:30 +05:30
Blade He	fb5dda2170	1. optimize performance_fee_costs prompts 2. support calculate metrics by zero equal with empty	2025-03-12 23:45:52 -05:00
Blade He	c2c0b33015	align fund name based on production name optimize performance relevant prompts	2025-03-12 21:52:00 -05:00
Blade He	6f17c2253c	optimize instructions for document 412778803	2025-03-12 17:24:39 -05:00
Blade He	765772e5a8	optimize performance_fee_costs by document 391080133	2025-03-12 14:45:48 -05:00
Ravi Maheshwari	76fbb7c071	Merge branches 'aus_prospectus_ravi' and 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-12 14:16:27 +05:30
Blade He	c7c36dbdd2	1. update performance_fee name to performance_fee_costs 2. support extract data for total_annual_dollar_based_charges	2025-03-11 17:15:39 -05:00
Blade He	b7506c78f3	Add API code file	2025-03-10 16:00:17 -05:00
Blade He	e9f6383258	apply configuration file to replace disorder table header contents	2025-03-10 11:09:00 -05:00
Blade He	2548606ccc	a little change	2025-03-10 08:20:01 -05:00
Blade He	604ab326a7	a little change	2025-03-08 21:50:44 -06:00
Blade He	4ee762963e	optimized for management_fee_and_costs and administration_fees	2025-03-08 21:40:00 -06:00
Blade He	fa2dede454	optimize for management_fee_and_costs and management_fee	2025-03-07 18:38:36 -06:00
Blade He	2cd4f5f787	Supplement provider information to ground truth data Calculate metrics based on providers Integrate "merge" data algorithm for AUS Prospectus final outputs	2025-03-07 15:02:12 -06:00
Blade He	52515fc152	1. simplify management_fee_and_costs instructions 2. optimize management_fee_and_costs instructions 3. resolve the issues for complex scenarios: need sum management_fee, recoverable_expenses, indirect_costs as management_fee_and_costs	2025-03-06 17:27:18 -06:00
Blade He	c4ed65770d	Try to support more complex management_fee_and_costs scenarios Support calculate all of data points metrics	2025-03-05 17:21:13 -06:00
Blade He	cd7e09757d	check in calc_metrics to repo.	2025-03-05 09:57:02 -06:00
Ravi Maheshwari	fdcb4b2ec0	Merge branch 'main' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi	2025-03-05 12:01:12 +05:30
Blade He	d00820c14d	update AUS Prospectus data point configurations	2025-03-04 16:52:06 -06:00
Blade He	f4b4d00f58	optimize instructions for management fee and costs. support dynamic loading complex instructions by keywords	2025-03-04 08:32:55 -06:00
Blade He	d3be711859	optimize administration fees instructions	2025-02-28 22:12:18 -06:00
Blade He	d4bc3aba4e	optimize for management fees	2025-02-28 16:55:33 -06:00
Blade He	d0295995d8	support judge whether next page contents with same structure table as current page. If yes, handle next page data extraction pipeline.	2025-02-27 23:08:57 -06:00
Blade He	d0128d6279	1. optimize for administration fees. 2. optimize for management fees	2025-02-27 17:36:41 -06:00
Blade He	543cab74e1	1. get production name 2. if some data point with production name, set each fund/ share with relevant data point value(s)	2025-02-27 12:07:49 -06:00
Blade He	412692e1c4	update keywords for management fee and costs	2025-02-27 08:34:46 -06:00
Blade He	70079d176e	Support remove duplicated values to keep the values to be the latest ones.	2025-02-26 17:05:58 -06:00
Blade He	f467945cd4	support benchmark name data extraction	2025-02-26 10:05:46 -06:00
Blade He	357bb6d580	1. support dynamic show fund level data examples. 2. optimize for minimum_initial_investment data point	2025-02-25 10:35:53 -06:00
Blade He	e60e1fd546	move configuration files for all datapoints to "all_datapoints" folder	2025-02-24 15:23:16 -06:00
Blade He	590f7e2249	1. backup data points configurations 2. simplify data points configurations for important 11 data points.	2025-02-24 15:21:32 -06:00
Blade He	75ea383354	support identify aus prospectus document category: MIS or Super	2025-02-24 15:08:15 -06:00
Blade He	bb6862b179	update a little	2025-02-19 14:32:08 -06:00
Blade He	705933bbdd	optimized for phase 2 data	2025-02-18 18:52:26 -06:00
Blade He	353bc28599	update a little	2025-02-11 11:49:53 -06:00
Blade He	01e2a0e38d	add configuration for datapoints data types update configuration for minimum initial investment support apply value to all of funds for minimum initial investment	2025-02-05 12:08:12 -06:00