Compare commits

..

19 Commits

Author SHA1 Message Date
blade 255752c848 add mini_main.py 2025-11-10 16:55:55 +08:00
Blade He 37cf06a394 Confirm span pages calculation, the management fee and costs page only with management_fee_and_costs and management_fee datapoints 2025-04-03 18:08:27 -05:00
Blade He f333cc30f5 1. Handle the scenario where the document type is not 1, 4, or 5
2. support the scenario:
"investment fees and costs including performance" statement in performance fee data page, instead of in management fee and costs data page.
2025-04-03 17:06:43 -05:00
Blade He 4b896f4460 update latest metrics based on optimized matching algorithm
Support: 2663 -> 2773
percentage of Share Matched: 80.08 -> 82.59

F1: 0.956 -> 0.943
2025-04-02 20:39:31 -05:00
Blade He 427a379b3b 1. support re-call ChatGPT API to match non-matched prediction fund/ share names
2. If the document has fewer than 3 funds, skip the production name judgment logic
2025-04-02 16:34:41 -05:00
Blade He 4cee95db9a fix issue for post actions 2025-03-31 22:04:31 -05:00
Blade He 50e51e0894 recover main.py 2025-03-31 17:16:05 -05:00
Blade He a42033f848 Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi 2025-03-31 17:09:06 -05:00
Blade He 984c686bf3 support separate tables and pages data which with specific biz rules 2025-03-31 17:08:49 -05:00
Russell Spence ac6332ad46 Merge branch 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi 2025-03-28 08:36:58 -05:00
Blade He 355b145cf7 If total_annual_dollar_based_charges is found and is divisible by 52 or 12,
then set the fund name and share name to the document production name
2025-03-28 01:33:33 -05:00
Blade He 46f86b124b update instructions fund name section structure 2025-03-28 00:51:51 -05:00
Blade He 8a5723c150 optimize for Entry Fee/ Nil Entry case 2025-03-27 21:10:33 -05:00
Blade He d925992326 1. Support the keywords of complex special cases to be regex
2. Support set sub-datapoints list to complex special cases node.
3. Simplify the common management fee and costs instructions.
4. Add markdown title characters: ## or ### to instructions.
2025-03-27 16:00:19 -05:00
Blade He dc560e1e01 update metrics 2025-03-26 23:14:28 -05:00
Blade He ff2325c72d 1. fix issue for assign values based on production name
2. optimize instructions for extract non-necessary data by Cost of Product message
2025-03-26 18:58:45 -05:00
Blade He 8ad472fb39 UPDATE metrics code file 2025-03-24 18:00:53 -05:00
Blade He dd1f8f76ae update for metrics 2025-03-24 17:12:13 -05:00
Blade He 4edc4b4768 clean code 2025-03-24 17:10:16 -05:00
15 changed files with 1586 additions and 438 deletions

1
.gitignore vendored
View File

@ -16,3 +16,4 @@
/performance.ipynb /performance.ipynb
/sample_documents/special_cases.txt /sample_documents/special_cases.txt
/aus-prospectus/ /aus-prospectus/
/output/log/*.log

View File

@ -114,25 +114,6 @@ def calc_metrics(ground_truth_file: str, prediction_file: str):
print(f"Accuracy TOR: {accuracy_tor}") print(f"Accuracy TOR: {accuracy_tor}")
def transform_pdf_2_image(
    folder: str = r"/Users/bhe/OneDrive - MORNINGSTAR INC/Personal Document/US_Life/pay/",
    pdf_file: str = r"Pay_Date_2025-02-14.pdf",
    dpi: int = 300,
):
    """
    Transform pdf to image.

    Renders every page of ``pdf_file`` (located in ``folder``) to a PNG file
    named ``<pdf name without extension>_<page index>.png`` in the same folder.

    Args:
        folder: Directory that contains the PDF and receives the PNG files.
        pdf_file: File name of the PDF inside ``folder``.
        dpi: Resolution passed to the page renderer.
    """
    import fitz

    pdf_path = os.path.join(folder, pdf_file)
    # splitext removes only the trailing extension; str.replace(".pdf", "")
    # would also remove ".pdf" occurring in the middle of the name.
    pdf_file_pure_name, _ = os.path.splitext(pdf_file)
    pdf_doc = fitz.open(pdf_path)
    try:
        for page_num in range(pdf_doc.page_count):
            page = pdf_doc.load_page(page_num)
            image = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(folder, f"{pdf_file_pure_name}_{page_num}.png")
            image.save(image_path)
    finally:
        # Release the file handle even when rendering or saving a page fails.
        pdf_doc.close()
def invoke_api_demo(doc_id: str = "407881493"): def invoke_api_demo(doc_id: str = "407881493"):
headers = {"connection": "keep-alive", "content-type": "application/json"} headers = {"connection": "keep-alive", "content-type": "application/json"}
data = { data = {
@ -1432,7 +1413,7 @@ def merge_inference_data():
if __name__ == "__main__": if __name__ == "__main__":
merge_inference_data() # merge_inference_data()
# adjust_column_order() # adjust_column_order()
# set_mapping_to_data_side_documents_data() # set_mapping_to_data_side_documents_data()
@ -1481,7 +1462,6 @@ if __name__ == "__main__":
# calculate_metrics_based_audit_file(is_strict=False) # calculate_metrics_based_audit_file(is_strict=False)
# remove_ter_ogc_performance_fee_annotation() # remove_ter_ogc_performance_fee_annotation()
# batch_run_documents() # batch_run_documents()
# transform_pdf_2_image()
# ground_truth_file = "./test_metrics/ground_truth.xlsx" # ground_truth_file = "./test_metrics/ground_truth.xlsx"
# prediction_file = "./test_metrics/prediction.xlsx" # prediction_file = "./test_metrics/prediction.xlsx"
# calc_metrics(ground_truth_file, prediction_file) # calc_metrics(ground_truth_file, prediction_file)

View File

@ -0,0 +1,42 @@
{
"management_fee_including_performance_fee": {
"details": [
{"regex_text": ["investment\\s*fees\\s*and\\s*costs\\W*including\\s*performance\\s*fees"],
"effective_datapoints": ["management_fee_and_costs"],
"exclude_datapoints": ["performance_fee_costs"]},
{"regex_text": ["Investment\\s*fees\\s*and\\s*costs\\s*includ.*performance\\s*fees"],
"effective_datapoints": ["performance_fee_costs"],
"exclude_datapoints": ["management_fee_and_costs"]}
],
"provider_ids": ["0C00006CX6", "0C000056BP", "0C000056KJ", "0C000056KK",
"0C000069VJ", "0C0000AL58", "0C00006B9E",
"0C00006BDB", "0C00006BDD", "0C00006BDG",
"0C000035YC", "0C0000CSKN", "0C00005549",
"0C000051C6", "0C00008JA0", "0C000093Z4",
"0C0000B5L6", "0C00006EGK", "0C00006EJI",
"0C00006FYL", "0C00006G0Q", "0C00006GIF",
"0C00006GNW", "0C00006GPU", "0C00006H46",
"0C00006H4J", "0C00006H4Q", "0C0000A5XQ",
"0C0000BBPL", "0C0000C2MS", "0C0000CVRL",
"0C0000AV6P", "0C00001XXQ", "0C00001XYR",
"0C00006AZB", "0C00006BN6", "0C00006BXE",
"0C00006CIK", "0C00006CJ2", "0C00006DOA",
"0C0000CAQF", "0C0000CAQH", "0C0000CAQO",
"0C0000CAQR"],
"provider_names": ["Bh Super Pty Ltd", "Mellett Super Pty Ltd", "LQ Super Pty Ltd",
"Q Super Pty Ltd", "RPM Super Pty Ltd", "VicSuper Pty Ltd",
"RMK Super Pty Ltd", "CCM Super Pty Ltd", "Judd Super Pty Ltd",
"JMJ Super Pty Ltd", "CARE Super Pty Ltd", "AvSuper Pty Ltd",
"Vision Super Pty Ltd", "AustralianSuper Pty Ltd", "First Super Pty Ltd",
"GeoVet Super Pty Ltd", "Gilby Super Pty Ltd", "Incani & Papadopoulos Super Pty Ltd",
"Gardner Super Pty Ltd", "Terlet Super Pty Ltd", "Rizzo Super Pty Ltd",
"Mellet Super Pty Ltd", "Smithley Super Pty Ltd", "Snowflake Super Pty Ltd",
"Fruitful Super Pty Ltd", "Seawell Super Pty Ltd", "St Super Pty Ltd",
"Christian Super Pty Ltd", "SCS Super Pty Ltd", "Aware Super Pty Ltd",
"Vanguard Super Pty Ltd", "United Super Pty Ltd", "National Australia Super Pty Ltd",
"AGEST Super Pty Ltd", "Huoncan Super Pty Ltd", "JHG Super Pty Ltd",
"Telstra Super Pty Ltd", "P & M Bellero Super Pty Ltd", "J J N A Super Pty Ltd",
"KSL Super Pty Ltd", "NESS Super Pty Ltd", "Prime Super Pty Ltd",
"PostSuper Pty Ltd", "Legal Super Pty Ltd"]
}
}

View File

@ -628,16 +628,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list) cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list)
cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list) cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list)
prompt_context = f""" prompt_context = f"""
{prompt_instruction} {prompt_instruction}
provider_name: {provider_name} provider_name: {provider_name}
prediction_fund: prediction_fund:
{cleaned_unmatched_pred_list} {cleaned_unmatched_pred_list}
true_fund: true_fund:
{cleaned_unmatched_db_list} {cleaned_unmatched_db_list}
""" """
# print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list) # print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list)
# print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list) # print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list)
# llm_response = get_llm_response(prompt_context) # llm_response = get_llm_response(prompt_context)
@ -653,67 +653,114 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc
llm_result = json_repair.loads(llm_response['response']) llm_result = json_repair.loads(llm_response['response'])
except: except:
llm_result = {} llm_result = {}
# try: unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
# llm_result = ast.literal_eval(llm_response['response'].replace('\n','')) unmatched_pred_list,
# except Exception as e: cleaned_unmatched_pred_list,
# logger.info(f"error: {e}") unmatched_db_list,
# cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '') cleaned_unmatched_db_list,
# llm_result = json.loads(cleaned_response) df_data,
# logger.info(f"\n\n llm_result: {llm_result}") final_result,
for k,v in llm_result.items(): record_empty=False)
# print("k: ",k) """
# print("v: ",v) For some cases, same document,
og_db_index=-1 perhaps same funds/ shares are with different raw names in different pages.
# og_pred_index = -1 e.g. High Growth Fund in page 8, Vision High Growth Fund in page 10, and they are same fund.
og_pred_index_list = [] But if only call ChatGPT API one time, it will not be able to match all of them.
if k in cleaned_unmatched_pred_list: """
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): if len(unmantched_pred_index_list)>0:
if c_item==k: unmatched_pred_list = [unmatched_pred_list[i] for i in unmantched_pred_index_list]
og_pred_index_list.append(c_idx) cleaned_unmatched_pred_list = [cleaned_unmatched_pred_list[i] for i in unmantched_pred_index_list]
# og_pred_index = cleaned_unmatched_pred_list.index(k) prompt_context = f"""
{prompt_instruction}
if len(og_pred_index_list) == 0: provider_name: {provider_name}
# sometimes, the raw name and db name reversed from the LLM response
if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list:
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list):
if c_item==v:
og_pred_index_list.append(c_idx)
# og_pred_index = cleaned_unmatched_pred_list.index(v)
og_db_index = cleaned_unmatched_db_list.index(k)
# v and k are swapped
temp = v
v = k
k = temp
if len(og_pred_index_list)==0:
continue
# og_db_index = cleaned_unmatched_db_list.index(v)
if og_db_index == -1 and v in cleaned_unmatched_db_list:
og_db_index = cleaned_unmatched_db_list.index(v)
# print("og_db_index: ",og_db_index, cleaned_unmatched_db_list)
# print("unmatched_db_list: ",unmatched_db_list)
for i in df_data: prediction_fund:
for og_pred_index in og_pred_index_list: {cleaned_unmatched_pred_list}
if i['pred_fund']==unmatched_pred_list[og_pred_index]:
if og_db_index!=-1:
i['db_fund']=unmatched_db_list[og_db_index]
i['cleaned_db_fund_name'] = v
final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]})
else:
i['db_fund'] = ''
i['cleaned_db_fund_name'] = ''
final_result.update({unmatched_pred_list[og_pred_index]:""})
i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
i['llm_clean_db_list'] = cleaned_unmatched_db_list,
i['llm_pred_fund'] = k
i['llm_matched_db_name'] = v
i['llm_result'] = llm_result
break
true_fund:
{cleaned_unmatched_db_list}
"""
llm_response, with_error = chat(
prompt=prompt_context, system_prompt=system_prompt, response_format={"type": "json_object"}
)
# logger.info(f"fund matching LLM Response: {llm_response}")
if 'response' in llm_response.keys():
try:
llm_result = json.loads(llm_response['response'])
except:
try:
llm_result = json_repair.loads(llm_response['response'])
except:
llm_result = {}
unmantched_pred_index_list = post_handle_fund_matching_call(llm_result,
unmatched_pred_list,
cleaned_unmatched_pred_list,
unmatched_db_list,
cleaned_unmatched_db_list,
df_data,
final_result,
record_empty=True)
# break
return final_result return final_result
def post_handle_fund_matching_call(llm_result,
                                   unmatched_pred_list,
                                   cleaned_unmatched_pred_list,
                                   unmatched_db_list,
                                   cleaned_unmatched_db_list,
                                   df_data,
                                   final_result,
                                   record_empty: bool = False):
    """
    Apply one LLM fund-matching answer to the match records.

    Args:
        llm_result: Mapping of cleaned prediction name -> cleaned db name, as
            parsed from the LLM response. Anything that is not a dict is ignored.
        unmatched_pred_list: Raw prediction names that still need a match.
        cleaned_unmatched_pred_list: Cleaned names, index-aligned with
            unmatched_pred_list.
        unmatched_db_list: Raw db names that are still available.
        cleaned_unmatched_db_list: Cleaned names, index-aligned with
            unmatched_db_list.
        df_data: List of per-prediction record dicts (read via 'pred_fund');
            matching records are updated in place.
        final_result: Dict of raw prediction name -> raw db name; updated in place.
        record_empty: When True, predictions the LLM could not map to a db
            name are written to final_result with an empty string.

    Returns:
        Indexes (into unmatched_pred_list) of the predictions that the LLM
        mentioned but could not map to a db name, without duplicates and in
        first-seen order.
    """
    unmatched_pred_index_list = []
    # json/json_repair may hand back a list or a string for a malformed answer.
    if not isinstance(llm_result, dict):
        return unmatched_pred_index_list

    for pred_name, db_name in llm_result.items():
        og_db_index = -1
        og_pred_index_list = [c_idx for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
                              if c_item == pred_name]
        if len(og_pred_index_list) == 0:
            # sometimes, the raw name and db name reversed from the LLM response
            if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list:
                og_pred_index_list = [c_idx for c_idx, c_item in enumerate(cleaned_unmatched_pred_list)
                                      if c_item == db_name]
                og_db_index = cleaned_unmatched_db_list.index(pred_name)
                # swap key and value back to (prediction, db) order
                pred_name, db_name = db_name, pred_name
        if len(og_pred_index_list) == 0:
            continue
        if og_db_index == -1 and db_name in cleaned_unmatched_db_list:
            og_db_index = cleaned_unmatched_db_list.index(db_name)

        for i in df_data:
            for og_pred_index in og_pred_index_list:
                if i['pred_fund'] != unmatched_pred_list[og_pred_index]:
                    continue
                if og_db_index != -1:
                    i['db_fund'] = unmatched_db_list[og_db_index]
                    i['cleaned_db_fund_name'] = db_name
                    final_result.update({unmatched_pred_list[og_pred_index]: unmatched_db_list[og_db_index]})
                else:
                    # Several records can share one prediction name; report the
                    # index once so a follow-up prompt does not repeat the name.
                    if og_pred_index not in unmatched_pred_index_list:
                        unmatched_pred_index_list.append(og_pred_index)
                    i['db_fund'] = ''
                    i['cleaned_db_fund_name'] = ''
                    if record_empty:
                        final_result.update({unmatched_pred_list[og_pred_index]: ""})
                i['llm_clean_pred_list'] = cleaned_unmatched_pred_list
                # Store the list itself (a trailing comma here used to wrap it in a tuple).
                i['llm_clean_db_list'] = cleaned_unmatched_db_list
                i['llm_pred_fund'] = pred_name
                i['llm_matched_db_name'] = db_name
                i['llm_result'] = llm_result
                break
    return unmatched_pred_index_list
def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names): def api_for_fund_matching_call(doc_id, api_response, providerName, all_investment_db_names):
result = api_response['data'] result = api_response['data']
doc_fund_names = [item['fund_name'] for item in result] doc_fund_names = [item['fund_name'] for item in result]

View File

@ -11,7 +11,7 @@ from utils.sql_query_util import query_document_fund_mapping, query_investment_b
from utils.logger import logger from utils.logger import logger
from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \ from utils.biz_utils import add_slash_to_text_as_regex, clean_text, \
get_most_similar_name, remove_abundant_data, replace_special_table_header get_most_similar_name, remove_abundant_data, replace_special_table_header
from utils.similarity import Similarity
class DataExtraction: class DataExtraction:
def __init__( def __init__(
@ -75,11 +75,33 @@ class DataExtraction:
self.datapoint_type_config = self.get_datapoint_type() self.datapoint_type_config = self.get_datapoint_type()
self.datapoint_name_config = self.get_datapoint_name() self.datapoint_name_config = self.get_datapoint_name()
self.replace_table_header_config = self.get_replace_table_header_config() self.replace_table_header_config = self.get_replace_table_header_config()
self.special_datapoint_feature_config = self.get_special_datapoint_feature_config()
self.special_datapoint_feature = self.init_special_datapoint_feature()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name() self.get_datapoint_reported_name()
self.extract_way = extract_way self.extract_way = extract_way
self.output_image_folder = output_image_folder self.output_image_folder = output_image_folder
def get_special_datapoint_feature_config(self) -> dict:
    """
    Load "special_datapoint_feature.json" from self.configuration_folder.

    Returns:
        The parsed JSON content, or an empty dict when the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    special_datapoint_feature_config_file = os.path.join(self.configuration_folder, "special_datapoint_feature.json")
    # Open directly instead of checking os.path.exists first: one file system
    # access, and no window in which the file can vanish between check and open.
    try:
        with open(special_datapoint_feature_config_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
def init_special_datapoint_feature(self) -> dict:
    """
    Build the per-document state holder for the special datapoint features.

    Returns:
        A dict with one entry per feature name found in
        self.special_datapoint_feature_config, each mapped to its own empty
        dict. Empty when the configuration is None or has no features.
    """
    config = self.special_datapoint_feature_config
    if not config:
        return {}
    # A comprehension (not dict.fromkeys) so every feature gets a distinct dict.
    return {feature: {} for feature in config}
def get_document_category_production(self): def get_document_category_production(self):
document_category = None document_category = None
document_production = None document_production = None
@ -131,7 +153,6 @@ class DataExtraction:
pass pass
return fund_name return fund_name
def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict: def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict:
""" """
If document source is aus_propectus and document category is MIS If document source is aus_propectus and document category is MIS
@ -273,28 +294,25 @@ class DataExtraction:
def post_supplement_data(self, data_list: list) -> list: def post_supplement_data(self, data_list: list) -> list:
""" """
data_dict = {"doc_id": self.doc_id} Post supplement data for the extracted data
data_dict["page_index"] = page_num
data_dict["datapoints"] = ", ".join(page_datapoints)
data_dict["page_text"] = page_text
data_dict["instructions"] = instructions
data_dict["raw_answer"] = response
data_dict["extract_data"] = data
data_dict["extract_way"] = original_way
data_dict["prompt_token"] = result.get("prompt_token", 0)
data_dict["completion_token"] = result.get("completion_token", 0)
data_dict["total_token"] = result.get("total_token", 0)
""" """
data_list = self.check_benchmark(data_list) try:
data_list = self.supplement_ttr_pension(data_list) data_list = self.check_benchmark(data_list)
data_list = self.align_fund_share_name(data_list) data_list = self.supplement_ttr_pension(data_list)
data_list = self.supplement_minimum_initial_investment(data_list) data_list = self.align_fund_share_name(data_list)
data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list) data_list = self.supplement_minimum_initial_investment(data_list)
data_list = self.remove_duplicate_data(data_list) data_list = self.check_total_annual_dollar_based_charges(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name: data_list, datapoint_list_with_production_name = self.post_adjust_for_value_with_production_name(data_list)
data_list = self.post_adjust_management_fee_costs(data_list) data_list = self.remove_duplicate_data(data_list)
if "management_fee" not in datapoint_list_with_production_name and "management_fee_and_costs" not in datapoint_list_with_production_name:
data_list, adjust = self.post_management_fee_exclude_performance_fee(data_list)
if not adjust:
data_list = self.post_adjust_management_fee_costs(data_list)
data_list = self.check_administration_fees(data_list) data_list = self.check_administration_fees(data_list)
except Exception as e:
logger.error(f"Post supplement data error: {e}")
print_exc()
return data_list return data_list
def check_benchmark(self, data_list: list): def check_benchmark(self, data_list: list):
@ -322,6 +340,8 @@ class DataExtraction:
if "benchmark_name" not in keys: if "benchmark_name" not in keys:
continue continue
benchmark_name = data_item.get("benchmark_name", "") benchmark_name = data_item.get("benchmark_name", "")
if len(benchmark_name) == 0:
continue
if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \ if benchmark_name.startswith("A range") or benchmark_name.startswith("The fund") or \
benchmark_name.startswith("CPI "): benchmark_name.startswith("CPI "):
data_item.pop("benchmark_name") data_item.pop("benchmark_name")
@ -394,11 +414,22 @@ class DataExtraction:
fund_name = data_item.get("fund_name", "") fund_name = data_item.get("fund_name", "")
if len(fund_name) == 0: if len(fund_name) == 0:
continue continue
share_name = data_item.get("share_name", "")
updated_fund_name = self.update_pension_ttr_fund_name(fund_name)
if updated_fund_name != fund_name:
fund_name = updated_fund_name
data_item["fund_name"] = fund_name
updated_share_name = self.update_pension_ttr_fund_name(share_name)
if updated_share_name != share_name:
share_name = updated_share_name
data_item["share_name"] = share_name
fund_name_splits = fund_name.split() fund_name_splits = fund_name.split()
if fund_name_splits[-1] == "TTR": if fund_name_splits[-1] == "TTR" and fund_name not in ttr_fund_name_list:
ttr_fund_name_list.append(fund_name) ttr_fund_name_list.append(fund_name)
exist_ttr = True exist_ttr = True
if fund_name_splits[-1] == "Pension": if fund_name_splits[-1] == "Pension" and fund_name not in pension_fund_name_list:
pension_fund_name_list.append(fund_name) pension_fund_name_list.append(fund_name)
exist_pension = True exist_pension = True
if exist_ttr and exist_pension: if exist_ttr and exist_pension:
@ -449,6 +480,22 @@ class DataExtraction:
data.extend(new_item_list) data.extend(new_item_list)
return data_list return data_list
def update_pension_ttr_fund_name(self, investment_name: str):
    """
    Move a leading pension / TTR product phrase to a trailing "Pension" / "TTR" suffix.

    The leading words that form one of the known prefixes are dropped and the
    suffix is appended to what remains, e.g.
    "Retirement Account Balanced" -> "Balanced Pension".

    Args:
        investment_name: Fund or share name as extracted from the document.

    Returns:
        The rewritten name, or investment_name unchanged when it is not a
        string, does not start with a known prefix, or consists of nothing
        but the prefix.
    """
    if not isinstance(investment_name, str):
        return investment_name

    suffix_prefix_groups = (
        ("Pension", ("retirement account", "account-based pension", "account based pension")),
        ("TTR", ("transition to retirement account", "pre-retirement pension", "pre retirement pension")),
    )
    name_words = investment_name.split()
    # Compare on a whitespace-normalised form so double/leading/trailing
    # spaces neither hide a prefix nor fake a non-empty remainder.
    normalized_name = " ".join(name_words).lower()
    for suffix, prefix_list in suffix_prefix_groups:
        for prefix in prefix_list:
            if not normalized_name.startswith(prefix) or normalized_name == prefix:
                continue
            remaining_words = name_words[len(prefix.split()):]
            if len(remaining_words) == 0:
                # e.g. "Retirement Accounts": nothing is left after dropping
                # the prefix words, so keep the name instead of returning a
                # bare " Pension" / " TTR".
                return investment_name
            return " ".join(remaining_words) + " " + suffix
    return investment_name
def check_administration_fees(self, data_list: list): def check_administration_fees(self, data_list: list):
""" """
If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list If document source is aus_prospectus and document category is MIS, then remove the administration fees from data_list
@ -476,15 +523,49 @@ class DataExtraction:
pass pass
return data_list return data_list
def check_total_annual_dollar_based_charges(self, data_list: list):
    """
    If found total_annual_dollar_based_charges and could be divisible by 52 or 12,
    then set the fund name and share name to be document production name.

    "Divisible" means the weekly (/52) or monthly (/12) amount has at most
    four decimal places. Only the first qualifying item in the whole
    data_list is renamed; items without a fund name are ignored.

    Args:
        data_list: Page-level dicts; each may hold item dicts under
            ["extract_data"]["data"]. Items are modified in place.

    Returns:
        The same data_list object.
    """
    import math

    production_name = self.document_production
    # Without a production name there is nothing meaningful to assign
    # (assigning None/"" would break later name handling).
    if not production_name:
        return data_list

    def _splits_evenly(amount: float, periods: int) -> bool:
        # Tolerant comparison: e.g. 67.6 / 52 is not exactly 1.3 in binary floats.
        per_period = amount / periods
        return math.isclose(per_period, round(per_period, 4), rel_tol=0.0, abs_tol=1e-9)

    for data_dict in data_list:
        extract_data = data_dict.get("extract_data", {})
        data = extract_data.get("data", [])
        for data_item in data:
            if not data_item.get("fund_name", ""):
                continue
            if "total_annual_dollar_based_charges" not in data_item:
                continue
            # Values can arrive as strings or None; a failed conversion must
            # not raise, because the caller would then skip every later
            # post-processing step.
            try:
                value = float(data_item["total_annual_dollar_based_charges"])
            except (TypeError, ValueError, OverflowError):
                continue
            if not math.isfinite(value):
                continue
            if _splits_evenly(value, 52) or _splits_evenly(value, 12):
                data_item["fund_name"] = production_name
                data_item["share_name"] = production_name
                # Stop at the first match, as before.
                return data_list
    return data_list
def post_adjust_for_value_with_production_name(self, data_list: list): def post_adjust_for_value_with_production_name(self, data_list: list):
""" """
If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value. If some datapoint with production name, then each fund/ share class in the same document for the datapoint should be with same value.
""" """
if len(self.fund_name_list) < 3:
return data_list, []
raw_name_dict = self.get_raw_name_dict(data_list) raw_name_dict = self.get_raw_name_dict(data_list)
raw_name_list = list(raw_name_dict.keys()) raw_name_list = list(raw_name_dict.keys())
if len(raw_name_list) < 3:
return data_list, []
raw_name_as_production_name = None raw_name_as_production_name = None
for raw_name in raw_name_list: for raw_name in raw_name_list:
if raw_name.lower() in self.document_production.lower(): if self.is_production_name(raw_name):
raw_name_as_production_name = raw_name raw_name_as_production_name = raw_name
break break
datapoint_list_with_production_name = [] datapoint_list_with_production_name = []
@ -505,7 +586,7 @@ class DataExtraction:
fund_name = data_item.get("fund_name", "") fund_name = data_item.get("fund_name", "")
share_name = data_item.get("share_name", "") share_name = data_item.get("share_name", "")
raw_name = self.get_raw_name(fund_name, share_name) raw_name = self.get_raw_name(fund_name, share_name)
if raw_name.lower() in self.document_production.lower(): if self.is_production_name(raw_name):
dp_keys = [key for key in keys if key not in ["fund_name", dp_keys = [key for key in keys if key not in ["fund_name",
"share_name", "share_name",
"management_fee_and_costs", "management_fee_and_costs",
@ -557,6 +638,15 @@ class DataExtraction:
extract_data["data"].remove(remove_item) extract_data["data"].remove(remove_item)
return data_list, datapoint_list_with_production_name return data_list, datapoint_list_with_production_name
def is_production_name(self, text: str):
    """
    Tell whether text refers to the document production name.

    text counts as the production name when it is a case-insensitive
    substring of self.document_production, or when the edit-distance
    similarity between the two is above 0.93.

    Args:
        text: Raw fund/share name to check.

    Returns:
        True or False. False when either text or self.document_production
        is not a string (the production name may still be None).
    """
    production_name = self.document_production
    if not isinstance(text, str) or not isinstance(production_name, str):
        return False
    if text.lower() in production_name.lower():
        return True
    similarity = Similarity().edit_distance_similarity(text, production_name)
    return bool(similarity > 0.93)
def remove_duplicate_data(self, data_list: list): def remove_duplicate_data(self, data_list: list):
""" """
The purpose is to remove duplicate data in the different pages. The purpose is to remove duplicate data in the different pages.
@ -629,6 +719,8 @@ class DataExtraction:
raw_name = self.get_raw_name(fund_name, share_name) raw_name = self.get_raw_name(fund_name, share_name)
if len(raw_name) == 0: if len(raw_name) == 0:
continue continue
if raw_name.lower() in ["the fund", "sample fund"]:
continue
# if isinstance(self.document_production, str) and \ # if isinstance(self.document_production, str) and \
# raw_name.lower() in self.document_production.lower(): # raw_name.lower() in self.document_production.lower():
# continue # continue
@ -636,6 +728,95 @@ class DataExtraction:
raw_name_dict[raw_name] = {"fund_name": fund_name, "share_name": share_name} raw_name_dict[raw_name] = {"fund_name": fund_name, "share_name": share_name}
return raw_name_dict return raw_name_dict
def post_management_fee_exclude_performance_fee(self, data_list: list):
    """
    Subtract the performance fee from management fee and costs values that include it.

    Reads self.special_datapoint_feature["management_fee_including_performance_fee"]:
    "page_index" lists the pages whose management_fee_and_costs values are to be
    adjusted; when "datapoint" is "performance_fee_costs" that page list is
    rebuilt from data_list instead.

    For each item on those pages that carries only fund_name, share_name,
    management_fee_and_costs and/or management_fee, the performance_fee_costs
    of the item with the same fund and share name (case-insensitive) found on
    a later page is subtracted. management_fee is set to the same result and
    "source" records the subtraction.

    Args:
        data_list: Page-level dicts with "page_index" and
            ["extract_data"]["data"]; items are modified in place.

    Returns:
        Tuple (data_list, adjust); adjust is True when at least one value
        was changed.
    """
    def _to_fee(raw_value):
        # Unparseable values and the -1 sentinel both mean "no usable value".
        try:
            fee = float(raw_value)
        except (TypeError, ValueError, OverflowError):
            return None
        return None if fee == -1 else fee

    adjust = False
    feature = self.special_datapoint_feature.get("management_fee_including_performance_fee", {})
    management_fee_index_list = feature.get("page_index", [])
    if len(management_fee_index_list) == 0:
        return data_list, adjust

    effective_datapoint = feature.get("datapoint", "")
    if effective_datapoint == "performance_fee_costs":
        management_fee_index_list = []
        # NOTE(review): both flags accumulate across pages (they are not reset
        # per page); kept as is - confirm whether per-page flags were intended.
        exist_effective_datapoints = False
        exist_exclude_datapoints = False
        for data_dict in data_list:
            page_index = data_dict.get("page_index", -1)
            data = data_dict.get("extract_data", {}).get("data", [])
            for data_item in data:
                if "management_fee_and_costs" in data_item:
                    exist_effective_datapoints = True
                if "performance_fee_costs" in data_item:
                    exist_exclude_datapoints = True
                if exist_effective_datapoints and exist_exclude_datapoints:
                    break
            if exist_effective_datapoints and not exist_exclude_datapoints:
                if page_index not in management_fee_index_list:
                    management_fee_index_list.append(page_index)
        if len(management_fee_index_list) == 0:
            # No page qualified; min() below would raise ValueError on an empty list.
            return data_list, adjust

    # Performance fee items are only taken from pages after the first
    # management fee page.
    min_page_index = min(management_fee_index_list)
    performance_fee_item_list = []
    for data_dict in data_list:
        page_index = data_dict.get("page_index", -1)
        if page_index <= min_page_index:
            continue
        data = data_dict.get("extract_data", {}).get("data", [])
        for data_item in data:
            if not data_item.get("share_name", ""):
                continue
            if "performance_fee_costs" in data_item:
                performance_fee_item_list.append(data_item)

    allowed_keys = ["fund_name", "share_name", "management_fee_and_costs", "management_fee"]
    for data_dict in data_list:
        page_index = data_dict.get("page_index", -1)
        if page_index not in management_fee_index_list:
            continue
        management_fee_data_list = data_dict.get("extract_data", {}).get("data", [])
        for management_fee_data in management_fee_data_list:
            fund_name = management_fee_data.get("fund_name", "")
            share_name = management_fee_data.get("share_name", "")
            if not fund_name or not share_name:
                continue
            # Only items that hold nothing but management fee datapoints are adjusted.
            if any(key not in allowed_keys for key in management_fee_data):
                continue
            if "management_fee_and_costs" not in management_fee_data:
                continue
            management_fee_and_costs = _to_fee(management_fee_data.get("management_fee_and_costs", -1))
            if management_fee_and_costs is None:
                continue
            for performance_fee_item in performance_fee_item_list:
                pf_fund_name = performance_fee_item.get("fund_name", "")
                pf_share_name = performance_fee_item.get("share_name", "")
                if not pf_fund_name or not pf_share_name:
                    continue
                if pf_fund_name.lower() != fund_name.lower() or pf_share_name.lower() != share_name.lower():
                    continue
                performance_fee_costs = _to_fee(performance_fee_item.get("performance_fee_costs", -1))
                if performance_fee_costs is not None:
                    management_fee_data["management_fee_and_costs"] = management_fee_and_costs - performance_fee_costs
                    management_fee_data["management_fee"] = management_fee_data["management_fee_and_costs"]
                    management_fee_data["source"] = f"subtract_performance_fee_{performance_fee_costs}"
                    adjust = True
                    break
    return data_list, adjust
def post_adjust_management_fee_costs(self, data_list: list): def post_adjust_management_fee_costs(self, data_list: list):
""" """
Adjust the management fee and management fee and costs Adjust the management fee and management fee and costs
@ -668,9 +849,10 @@ class DataExtraction:
for mf in management_fee_list: for mf in management_fee_list:
mf_fund_name = mf.get("fund_name", "") mf_fund_name = mf.get("fund_name", "")
mf_share_name = mf.get("share_name", "") mf_share_name = mf.get("share_name", "")
if (mf_fund_name == fund_name and mf_share_name == share_name) or \ # if (mf_fund_name == fund_name and mf_share_name == share_name) or \
(len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and # (len(mf_fund_name) > 0 and len(mf_share_name) > 0 and mf_fund_name == mf_share_name and
(mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): # (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))):
if (mf_fund_name == fund_name and mf_share_name == share_name):
if exist_complex_rule_keywords and \ if exist_complex_rule_keywords and \
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mf["management_fee"] = management_fee mf["management_fee"] = management_fee
@ -693,9 +875,10 @@ class DataExtraction:
for mfc in management_fee_costs_list: for mfc in management_fee_costs_list:
mfc_fund_name = mfc.get("fund_name", "") mfc_fund_name = mfc.get("fund_name", "")
mfc_share_name = mfc.get("share_name", "") mfc_share_name = mfc.get("share_name", "")
if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \ # if (mfc_fund_name == fund_name and mfc_share_name == share_name) or \
(len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and # (len(mfc_fund_name) > 0 and len(mfc_share_name) > 0 and mfc_fund_name == mfc_share_name and
(mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))): # (mfc_share_name.endswith(share_name) or share_name.endswith(mfc_share_name))):
if (mfc_fund_name == fund_name and mfc_share_name == share_name):
if exist_complex_rule_keywords and \ if exist_complex_rule_keywords and \
("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys):
mfc["management_fee_and_costs"] = management_fee_costs mfc["management_fee_and_costs"] = management_fee_costs
@ -792,7 +975,7 @@ class DataExtraction:
previous_page_datapoints = [] previous_page_datapoints = []
previous_page_fund_name = None previous_page_fund_name = None
for page_num, page_text in self.page_text_dict.items(): for page_num, page_text in self.page_text_dict.items():
# if page_num not in [4, 5]: # if page_num not in [42]:
# continue # continue
if page_num in handled_page_num_list: if page_num in handled_page_num_list:
continue continue
@ -1180,9 +1363,17 @@ class DataExtraction:
except: except:
data = {"data": []} data = {"data": []}
try: try:
data = self.validate_data(extract_data_info=data, if self.doc_source == "emea_ar":
page_text=page_text, data = self.validate_emea_ar_data(extract_data_info=data,
previous_page_last_fund=previous_page_last_fund) page_text=page_text,
previous_page_last_fund=previous_page_last_fund)
elif self.doc_source == "aus_prospectus":
data = self.validate_aus_prospectus_data(extract_data_info=data,
page_text=page_text,
page_num=page_num,
previous_page_last_fund=previous_page_last_fund)
else:
pass
except: except:
pass pass
@ -1297,7 +1488,12 @@ class DataExtraction:
except: except:
data = {"data": []} data = {"data": []}
try: try:
data = self.validate_data(data, None, previous_page_last_fund) if self.doc_source == "emea_ar":
data = self.validate_emea_ar_data(data, None, previous_page_last_fund)
elif self.doc_source == "aus_prospectus":
data = self.validate_aus_prospectus_data(data, None, page_num, previous_page_last_fund)
else:
pass
except: except:
pass pass
@ -1336,7 +1532,7 @@ class DataExtraction:
# print(text) # print(text)
return text return text
def validate_data(self, def validate_emea_ar_data(self,
extract_data_info: dict, extract_data_info: dict,
page_text: str, page_text: str,
previous_page_last_fund: str=None) -> dict: previous_page_last_fund: str=None) -> dict:
@ -1348,6 +1544,7 @@ class DataExtraction:
data_list = extract_data_info.get("data", []) data_list = extract_data_info.get("data", [])
if len(data_list) == 0: if len(data_list) == 0:
return extract_data_info return extract_data_info
remove_list = [] remove_list = []
performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged" performance_fee_regex = r"Amount\s+of\s+the\s+performance\s+fees|Performance\s+Fees\s+amounts|Performance\s+fees\s+amounts|Commissioni\s+di\s+performance|Performance\s+Fee\s+|Performance\s+fees\s+charged"
nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV" nav_regex = r"based\s+on\s+(the\s+)?NAV|on\s+the\s+Share\s+Class\s+NAV|NAV\s+of\s+performance\s+fee|of\s+the\s+average\s+Net\s+Asset\s+Value|Attivi\s+in\s+gestione|Performance\s+Fee\s+of\s+NAV\s+in|share\s+class\s+dealing\s+NAV"
@ -1367,29 +1564,29 @@ class DataExtraction:
if len(keys) == 0: if len(keys) == 0:
remove_list.append(data) remove_list.append(data)
continue continue
fund_name = data.get("fund name", "").strip() raw_fund_name = data.get("fund name", "").strip()
if fund_name == "": if raw_fund_name == "":
remove_list.append(data) remove_list.append(data)
continue continue
# Clean fund name start # Clean fund name start
if previous_page_last_fund is not None and len(previous_page_last_fund) > 0: if previous_page_last_fund is not None and len(previous_page_last_fund) > 0:
previous_page_last_fund = previous_page_last_fund.strip() previous_page_last_fund = previous_page_last_fund.strip()
if fund_name.startswith(previous_page_last_fund) and fund_name != previous_page_last_fund: if raw_fund_name.startswith(previous_page_last_fund) and raw_fund_name != previous_page_last_fund:
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip() modified_fund_name = raw_fund_name.replace(previous_page_last_fund, "").strip()
if len(modified_fund_name.split()) > 1: if len(modified_fund_name.split()) > 1:
fund_name = modified_fund_name raw_fund_name = modified_fund_name
fund_name = self.get_fund_name(fund_name, "Fund") raw_fund_name = self.get_fund_name(raw_fund_name, "Fund")
fund_name = self.get_fund_name(fund_name, "Bond") raw_fund_name = self.get_fund_name(raw_fund_name, "Bond")
remove_prefix_list = ["Market Specific Equity Sub-Funds", remove_prefix_list = ["Market Specific Equity Sub-Funds",
"International and Regional Equity Sub-Funds", "International and Regional Equity Sub-Funds",
"Equity Sub-Funds"] "Equity Sub-Funds"]
for remove_item in remove_prefix_list: for remove_item in remove_prefix_list:
if fund_name.startswith(remove_item): if raw_fund_name.startswith(remove_item):
fund_name = fund_name.replace(remove_item, "").strip() raw_fund_name = raw_fund_name.replace(remove_item, "").strip()
data["fund name"] = fund_name data["fund name"] = raw_fund_name
# Clean fund name end # Clean fund name end
keys = list(data.keys()) keys = list(data.keys())
@ -1403,11 +1600,11 @@ class DataExtraction:
if ter_search is not None: if ter_search is not None:
include_key_words = True include_key_words = True
if not include_key_words: if not include_key_words:
is_share_name = self.check_fund_name_as_share(fund_name) is_share_name = self.check_fund_name_as_share(raw_fund_name)
if not is_share_name: if not is_share_name:
remove_list.append(data) remove_list.append(data)
break break
data["share name"] = fund_name data["share name"] = raw_fund_name
if data.get(key, "") == "": if data.get(key, "") == "":
data.pop(key) data.pop(key)
for remove_data in remove_list: for remove_data in remove_list:
@ -1439,8 +1636,8 @@ class DataExtraction:
multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}" multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
exist_multi_over_3_share = False exist_multi_over_3_share = False
for data in data_list: for data in data_list:
fund_name = data.get("fund name", "").strip() raw_fund_name = data.get("fund name", "").strip()
if len(fund_name) == 0: if len(raw_fund_name) == 0:
continue continue
raw_share_name = data.get("share name", "") raw_share_name = data.get("share name", "")
if not exist_multi_over_3_share: if not exist_multi_over_3_share:
@ -1454,7 +1651,7 @@ class DataExtraction:
if len(share_name_list) > 0: if len(share_name_list) > 0:
for share_name in share_name_list: for share_name in share_name_list:
new_data = {} new_data = {}
new_data["fund_name"] = fund_name new_data["fund_name"] = raw_fund_name
if share_name != "": if share_name != "":
new_data["share_name"] = share_name new_data["share_name"] = share_name
ter = data.get("ter", None) ter = data.get("ter", None)
@ -1472,6 +1669,128 @@ class DataExtraction:
extract_data_info["data"] = new_data_list extract_data_info["data"] = new_data_list
return extract_data_info return extract_data_info
def validate_aus_prospectus_data(self,
                                 extract_data_info: dict,
                                 page_text: str,
                                 page_num: int,
                                 previous_page_last_fund: str=None) -> dict:
    """
    Normalise the records extracted from one page of an Australian prospectus.

    For every record in ``extract_data_info["data"]``:
    - records without a usable "fund name" are dropped;
    - when the fund name starts with ``previous_page_last_fund`` (and is not
      equal to it), that leading text is removed, provided more than one word
      remains afterwards;
    - the "fund name" / "share name" keys are renamed to "fund_name" /
      "share_name" (an empty share name is omitted), every other key is
      copied unchanged;
    - once any share name matches the multi-share pattern, that share name and
      all following ones are split with ``self.split_multi_share_name`` and one
      output record is produced per resulting share name.

    Args:
        extract_data_info: dict holding the extracted records under "data".
            It is updated in place and also returned.
        page_text: text of the page; when non-empty it is forwarded to
            ``self.set_datapoint_feature_properties`` together with the
            normalised records.
        page_num: page number forwarded to ``set_datapoint_feature_properties``.
        previous_page_last_fund: last fund name of the previous page, if any.

    Returns:
        ``extract_data_info`` with "data" replaced by the normalised records.
        An input with no records is returned untouched.
    """
    data_list = extract_data_info.get("data", [])
    if len(data_list) == 0:
        return extract_data_info

    def as_text(value) -> str:
        # Records come from parsed model output, so a name may be null or a
        # non-string; treat null as "missing" instead of raising.
        return "" if value is None else str(value)

    # Stripped once up front; an empty prefix disables the clean-up below.
    prefix = as_text(previous_page_last_fund).strip()

    new_data_list = []
    multi_over_3_share_regex = r"([A-Z]{1,}\,\s){3,}"
    # Sticky on purpose: after the first multi-share name is seen, every
    # following share name of this page is split as well.
    exist_multi_over_3_share = False
    for data in data_list:
        raw_fund_name = as_text(data.get("fund name", "")).strip()
        if raw_fund_name == "":
            continue

        # Clean fund name start
        if prefix and raw_fund_name.startswith(prefix) and raw_fund_name != prefix:
            # Remove only the leading occurrence: str.replace would also delete
            # the same text appearing later inside the fund name.
            modified_fund_name = raw_fund_name[len(prefix):].strip()
            if len(modified_fund_name.split()) > 1:
                raw_fund_name = modified_fund_name
        data["fund name"] = raw_fund_name
        # Clean fund name end

        raw_share_name = as_text(data.get("share name", ""))
        if not exist_multi_over_3_share and \
                re.search(multi_over_3_share_regex, raw_share_name) is not None:
            exist_multi_over_3_share = True
        if exist_multi_over_3_share:
            share_name_list = self.split_multi_share_name(raw_share_name)
        else:
            share_name_list = [raw_share_name]

        for share_name in share_name_list:
            new_data = {"fund_name": raw_fund_name}
            if share_name != "":
                new_data["share_name"] = share_name
            for key, value in data.items():
                if key not in ("fund name", "share name"):
                    new_data[key] = value
            new_data_list.append(new_data)

    extract_data_info["data"] = new_data_list
    if page_text is not None and len(page_text) > 0:
        # Best effort: a failure while flagging special datapoint features must
        # not discard the records that were just validated.
        try:
            self.set_datapoint_feature_properties(new_data_list, page_text, page_num)
        except Exception as e:
            logger.error(f"Error in setting datapoint feature properties: {e}")
    return extract_data_info
def set_datapoint_feature_properties(self, data_list: list, page_text: str, page_num: int) -> None:
    """
    Record on which page a configured special datapoint feature is found.

    Walks ``self.special_datapoint_feature_config`` and, for every feature
    that has no "page_index" recorded yet in ``self.special_datapoint_feature``,
    evaluates its "details" entries in order. The first entry that satisfies
    all of the following wins:
    - it has at least one "regex_text" and one "effective_datapoints" value;
    - some record in ``data_list`` has a key listed in "effective_datapoints";
    - no record in ``data_list`` has a key listed in "exclude_datapoints";
    - one of its "regex_text" patterns is found in ``page_text``
      (case-insensitive).
    For that entry, ``page_num`` is stored under "page_index" and the first
    effective datapoint under "datapoint".

    A feature that declares "provider_ids" is only evaluated when one of them
    appears in the "ProviderId" column of ``self.document_mapping_info_df``.

    Args:
        data_list: normalised records of the current page (dicts).
        page_text: text of the current page.
        page_num: number of the current page.
    """
    # Read from the mapping dataframe only if some feature is provider-bound,
    # and then only once per call.
    doc_provider_list = None

    for feature, properties in self.special_datapoint_feature_config.items():
        if self.special_datapoint_feature.get(feature, {}).get("page_index", None) is not None:
            continue

        provider_ids = properties.get("provider_ids", [])
        if len(provider_ids) > 0:
            if doc_provider_list is None:
                doc_provider_list = self.document_mapping_info_df["ProviderId"].unique().tolist()
            if not any(provider in doc_provider_list for provider in provider_ids):
                continue

        for detail in properties.get("details", []):
            regex_text_list = detail.get("regex_text", [])
            effective_datapoints = detail.get("effective_datapoints", [])
            if len(regex_text_list) == 0 or len(effective_datapoints) == 0:
                continue
            exclude_datapoints = detail.get("exclude_datapoints", [])

            exist_effective_datapoints = any(
                key in effective_datapoints
                for data_item in data_list for key in data_item
            )
            if not exist_effective_datapoints:
                continue
            exist_exclude_datapoints = any(
                key in exclude_datapoints
                for data_item in data_list for key in data_item
            )
            if exist_exclude_datapoints:
                continue

            found_regex_text = any(
                re.search(regex_text, page_text, re.IGNORECASE) is not None
                for regex_text in regex_text_list
            )
            if not found_regex_text:
                continue

            # setdefault keeps the write path consistent with the tolerant
            # .get(feature, {}) guard above: a feature that is configured but
            # has no state entry yet gets one instead of raising KeyError.
            feature_state = self.special_datapoint_feature.setdefault(feature, {})
            if feature_state.get("page_index", None) is None:
                feature_state["page_index"] = []
            feature_state["datapoint"] = effective_datapoints[0]
            feature_state["page_index"].append(page_num)
            # The first matching detail settles this feature.
            break
def split_multi_share_name(self, raw_share_name: str) -> list: def split_multi_share_name(self, raw_share_name: str) -> list:
""" """
Some document, e.g. 481482392 Some document, e.g. 481482392
@ -1597,6 +1916,7 @@ class DataExtraction:
if page_text is not None and len(page_text) > 0: if page_text is not None and len(page_text) > 0:
logger.info(f"Transfer previous page fund name: {page_text} to be the pre-fix of page text") logger.info(f"Transfer previous page fund name: {page_text} to be the pre-fix of page text")
summary += f"\nThe last fund name of previous PDF page: {page_text}\n" summary += f"\nThe last fund name of previous PDF page: {page_text}\n"
summary += "If could find the fund name for the first data point value, please ignore this fund name.\n"
else: else:
summary = self.instructions_config.get("summary", "\n") summary = self.instructions_config.get("summary", "\n")
@ -1608,7 +1928,7 @@ class DataExtraction:
instructions.extend(image_features) instructions.extend(image_features)
instructions.append("\n") instructions.append("\n")
instructions.append("Datapoints Reported name:\n") instructions.append("## Datapoints Reported name\n")
instructions.append("Please look for relevant reported names and similar variations in the context.\n") instructions.append("Please look for relevant reported names and similar variations in the context.\n")
reported_name_info_in_instructions = self.instructions_config.get("reported_name", {}) reported_name_info_in_instructions = self.instructions_config.get("reported_name", {})
for datapoint in datapoints: for datapoint in datapoints:
@ -1708,7 +2028,7 @@ class DataExtraction:
none_value_example_count += 1 none_value_example_count += 1
instructions.append("\n") instructions.append("\n")
instructions.append("Data business features:\n") instructions.append("## Data business features\n")
data_business_features = self.instructions_config.get( data_business_features = self.instructions_config.get(
"data_business_features", {} "data_business_features", {}
) )
@ -1716,7 +2036,7 @@ class DataExtraction:
instructions.append(common) instructions.append(common)
instructions.append("\n") instructions.append("\n")
instructions.append("Datapoints investment level:\n") instructions.append("## Datapoints investment level\n")
investment_level_info = data_business_features.get("investment_level", {}) investment_level_info = data_business_features.get("investment_level", {})
for datapoint in datapoints: for datapoint in datapoints:
investment_level = investment_level_info.get(datapoint, "") investment_level = investment_level_info.get(datapoint, "")
@ -1724,7 +2044,7 @@ class DataExtraction:
instructions.append("\n") instructions.append("\n")
instructions.append("\n") instructions.append("\n")
instructions.append("Datapoints value range:\n") instructions.append("## Datapoints value range\n")
data_value_range_info = data_business_features.get("data_value_range", {}) data_value_range_info = data_business_features.get("data_value_range", {})
for datapoint in datapoints: for datapoint in datapoints:
data_value_range = data_value_range_info.get(datapoint, "") data_value_range = data_value_range_info.get(datapoint, "")
@ -1738,7 +2058,13 @@ class DataExtraction:
# 2. To load it by keywords, is to avoid for simple case, the prompts are too long. # 2. To load it by keywords, is to avoid for simple case, the prompts are too long.
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "")
with_special_rule_title = False with_special_rule_title = False
found_sub_datapoints = []
datapoint_special_rule = {}
for datapoint in datapoints: for datapoint in datapoints:
# Once a complex special rule has been loaded, the datapoints it lists as
# sub datapoints are already covered, so skip loading their rules again.
if datapoint in found_sub_datapoints:
continue
find_complex_special_rule = False find_complex_special_rule = False
if page_text is not None and len(page_text) > 0: if page_text is not None and len(page_text) > 0:
complex_special_rule_list = complex_special_rule.get(datapoint, []) complex_special_rule_list = complex_special_rule.get(datapoint, [])
@ -1746,29 +2072,46 @@ class DataExtraction:
complex_keywords = complex_special_rule.get("keywords", []) complex_keywords = complex_special_rule.get("keywords", [])
if len(complex_keywords) == 0: if len(complex_keywords) == 0:
continue continue
# support keywords to be pure text or regex
keywords_is_regex = complex_special_rule.get("keywords_is_regex", False)
exist_keywords = False exist_keywords = False
for special_keywords in complex_keywords: for special_keywords in complex_keywords:
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords) if keywords_is_regex:
if special_keywords in page_text or \ if re.search(special_keywords, page_text) is not None:
re.search(special_keywrods_regex, page_text) is not None: exist_keywords = True
exist_keywords = True break
break else:
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
if special_keywords in page_text or \
re.search(special_keywrods_regex, page_text) is not None:
exist_keywords = True
break
if exist_keywords: if exist_keywords:
complex_prompts_list = complex_special_rule.get("prompts", []) complex_prompts_list = complex_special_rule.get("prompts", [])
if len(complex_prompts_list) > 0: if len(complex_prompts_list) > 0:
if not with_special_rule_title: if not with_special_rule_title:
instructions.append("Special rule:\n") instructions.append("## Special rule\n")
with_special_rule_title = True with_special_rule_title = True
complex_prompts = "\n".join(complex_prompts_list) complex_prompts = "\n".join(complex_prompts_list)
instructions.append(complex_prompts) instructions.append(complex_prompts)
instructions.append("\n\n") instructions.append("\n\n")
find_complex_special_rule = True find_complex_special_rule = True
# The complex special rule was found: record its sub datapoints in
# found_sub_datapoints so their own rules are not loaded again.
sub_datapoints = complex_special_rule.get("sub_datapoints", [])
if len(sub_datapoints) > 0:
found_sub_datapoints.extend(sub_datapoints)
if find_complex_special_rule: if find_complex_special_rule:
continue continue
special_rule_list = special_rule_info.get(datapoint, []) special_rule_list = special_rule_info.get(datapoint, [])
if len(special_rule_list) > 0: if len(special_rule_list) > 0:
datapoint_special_rule[datapoint] = special_rule_list
if len(list(datapoint_special_rule.keys())) > 0:
for datapoint, special_rule_list in datapoint_special_rule.items():
if datapoint in found_sub_datapoints:
continue
if not with_special_rule_title: if not with_special_rule_title:
instructions.append("Special rule:\n") instructions.append("## Special rule\n")
with_special_rule_title = True with_special_rule_title = True
special_rule = "\n".join(special_rule_list) special_rule = "\n".join(special_rule_list)
instructions.append(special_rule) instructions.append(special_rule)
@ -1776,7 +2119,7 @@ class DataExtraction:
instructions.append("\n") instructions.append("\n")
instructions.append("Special cases:\n") instructions.append("## Special cases\n")
special_cases = self.instructions_config.get("special_cases", {}) special_cases = self.instructions_config.get("special_cases", {})
special_cases_common_list = special_cases.get("common", []) special_cases_common_list = special_cases.get("common", [])
special_cases_number = 1 special_cases_number = 1
@ -1789,7 +2132,7 @@ class DataExtraction:
contents_list = special_cases_common.get("contents", []) contents_list = special_cases_common.get("contents", [])
contents = "\n".join(contents_list) contents = "\n".join(contents_list)
instructions.append(contents) instructions.append(contents)
instructions.append("\n\n") instructions.append("\n")
for datapoint in datapoints: for datapoint in datapoints:
special_case_list = special_cases.get(datapoint, []) special_case_list = special_cases.get(datapoint, [])
@ -1803,9 +2146,8 @@ class DataExtraction:
contents = "\n".join(contents_list) contents = "\n".join(contents_list)
instructions.append(contents) instructions.append(contents)
instructions.append("\n") instructions.append("\n")
instructions.append("\n")
instructions.append("Output requirement:\n") instructions.append("## Output requirement\n")
output_requirement = self.instructions_config.get("output_requirement", {}) output_requirement = self.instructions_config.get("output_requirement", {})
output_requirement_common_list = output_requirement.get("common", []) output_requirement_common_list = output_requirement.get("common", [])
instructions.append("\n".join(output_requirement_common_list)) instructions.append("\n".join(output_requirement_common_list))

View File

@ -148,9 +148,9 @@ class FilterPages:
} }
effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0] effective_date = self.document_mapping_info_df["EffectiveDate"].iloc[0]
document_type = self.document_mapping_info_df["DocumentType"].iloc[0] document_type = self.document_mapping_info_df["DocumentType"].iloc[0]
if document_type in [4, 5]: if document_type in [4, 5] or self.doc_source == "emea_ar":
document_type = "ar" document_type = "ar"
elif document_type == 1: elif document_type == 1 or self.doc_source == "aus_prospectus":
document_type = "prospectus" document_type = "prospectus"
language_id = self.document_mapping_info_df["Language"].iloc[0] language_id = self.document_mapping_info_df["Language"].iloc[0]
language = self.language_config.get(language_id, None) language = self.language_config.get(language_id, None)

View File

@ -16,9 +16,9 @@
], ],
"data_business_features": { "data_business_features": {
"common": [ "common": [
"General rules:", "## General rules",
"- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", "1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.",
"- 2. Fund name: ", "2. Fund name: ",
"a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.",
"b. The sub-fund name may be as the first column or first row values in the table.", "b. The sub-fund name may be as the first column or first row values in the table.",
"b.1 fund name example:", "b.1 fund name example:",
@ -48,13 +48,32 @@
"---Example End---", "---Example End---",
"Correct fund name: MLC Horizon 2 Income Portfolio", "Correct fund name: MLC Horizon 2 Income Portfolio",
"Correct share name: MLC Horizon 2 Income Portfolio", "Correct share name: MLC Horizon 2 Income Portfolio",
"f. In table header, \"Retirement account\" or \"Account-based pension\" means \"Pension\"; ",
"\"Transition to Retirement account\" or \"Pre-retirement pension\" means \"TTR\". ",
"Please append them to the fund name and share name.",
"f.1 Example 1",
"---Example 1 Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
"---Example 1 End---",
"The prefix is \"Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Retirement account Cash\".",
"f.2 Example 2",
"---Example 2 Start---",
"Transition to Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCash 0.05%0.00% 0.00% 0.05%\n",
"---Example 2 End---",
"The prefix is \"Transition to Retirement account\", the investment option is \"Cash\", so fund name and share name should be: \"Transition to Retirement account Cash\".",
"f.3 Example 3",
"---Example 3 Start---",
"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Funds reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
"---Example 3 End---",
"Although exist \"Retirement account\" and \"Transition to Retirement account\", but the investment option is not exist, so fund name and share name should be: \"Rest Pension\".",
"\n", "\n",
"- 3. Only extract the latest data from context:", "3. Only extract the latest data from context:",
"If with multiple data values in same row, please extract the latest.", "If with multiple data values in same row, please extract the latest.",
"\n", "\n",
"- 4. Reported names:", "4. Reported names:",
"**IGNORE THE TABLE WHICH TABLE HEADER IS WITH REPORTED NAME: \"Cost of product\"!!!**",
"Only output the values which with significant reported names.", "Only output the values which with significant reported names.",
"- Multiple data columns with same reported name but different post-fix:", "Multiple data columns with same reported name but different post-fix:",
"If there are multiple reported names with different post-fix text, here is the priority rule:", "If there are multiple reported names with different post-fix text, here is the priority rule:",
"The pos-fix text is in the brackets: (gross), (net), pick up the values from (net).", "The pos-fix text is in the brackets: (gross), (net), pick up the values from (net).",
"---Example Start---", "---Example Start---",
@ -62,8 +81,17 @@
"---Example End---", "---Example End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:", "5. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\".",
"6. Identify the value of data point and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0",
"---Example Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced Indexed 0.00% 0.00% 0.00% 0.00%\n",
"---Example End---",
"For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.",
"The fund name prefix is \"Retirement account\", the investment option is \"Balanced - Indexed\", so fund name and share name should be: \"Retirement account Balanced - Indexed\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"Retirement account Balanced - Indexed\", \"share name\": \"Retirement account Balanced - Indexed\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"performance_fee_costs\": 0}]}",
"7. If for data point value specifically Nil is written in the value then return NULL('') for the same"
], ],
"investment_level": { "investment_level": {
"total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.",
@ -95,7 +123,7 @@
"total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00", "total_annual_dollar_based_charges": "Total annual dollar based charges is belong to decimal number, the value could be more than 100, e.g. 625.00",
"management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.", "management_fee_and_costs": "Management fee and costs is belong to percentage number, the value should be less than 100.",
"management_fee": "Management fee is belong to percentage number, the value should be less than 100.", "management_fee": "Management fee is belong to percentage number, the value should be less than 100.",
"performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 100.", "performance_fee_costs": "Performance fees costs is belong to percentage number, the value should be less than 10.",
"buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.", "buy_spread": "Buy spread is belong to percentage number, the value should be less than 100.",
"sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.", "sell_spread": "Sell spread is belong to percentage number, the value should be less than 100.",
"establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.", "establishment_fee": "Establishment fee is belong to percentage number, the value should be less than 100.",
@ -120,8 +148,9 @@
}, },
"special_rule": { "special_rule": {
"management_fee_and_costs": [ "management_fee_and_costs": [
"### Management fee and cost",
"Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", "Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.",
"If there are multiple Management fee and costs reported names, here is the priority rule:", "A. If there are multiple Management fee and costs reported names, here are the priority rules:",
"A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".",
"---Example 1 Start---", "---Example 1 Start---",
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
@ -172,7 +201,7 @@
"The management_fee is the value of \"Management fee (% pa)\".", "The management_fee is the value of \"Management fee (% pa)\".",
"The management_fee_and_costs is the value of \"Total management cost (% pa)\".", "The management_fee_and_costs is the value of \"Total management cost (% pa)\".",
"---Example 1 Start---", "---Example 1 Start---",
"Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.00 0.04 0.00 0.01 1.38 0.31\n1.29 0.00 0.00 0.00 0.01 1.30 0.29\n",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}",
@ -201,6 +230,7 @@
"---Example 3 Start---", "---Example 3 Start---",
"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.200.50 5 \n\n", "Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.200.50 5 \n\n",
"---Example 3 End---", "---Example 3 End---",
"For value: 1.04%(g)/2.18%(n), (g) means gross, (n) means net, please extract net value: 2.18",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}", "{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}",
"\n", "\n",
@ -211,7 +241,8 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}", "{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
"\n", "\n",
"F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "F. If columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\" appear, please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\" for EACH SPECIFIC investment option. ",
"DO NOT assume these values apply to other investment options mentioned elsewhere in the context or from provided examples.",
"---Example 1 Start---", "---Example 1 Start---",
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n", "\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
"---Example 1 End---", "---Example 1 End---",
@ -251,85 +282,102 @@
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:", "Both of management_fee and management_fee_and_costs are the values for \"Management costs\", so the output should be:",
"{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}", "{\"data\": [{\"fund name\": \"FirstChoice Wholesale Defensive\", \"share name\": \"FirstChoice Wholesale Defensive\", \"management_fee_and_costs\": 0.85, \"management_fee\": 0.85}, {\"fund name\": \"FirstChoice Wholesale Conservative\", \"share name\": \"FirstChoice Wholesale Conservative\", \"management_fee_and_costs\": 0.9, \"management_fee\": 0.9, \"performance_fee_costs\": 0.02}]}",
"---Example 2 Start---", "---Example 2 Start---",
"Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nCapital Stable 0.46% 0.04% 0.08% 0.54%\nBalanced 0.52% 0.06% 0.10%0.62% \n",
"---Example 2 End---",
"The column: \"(A) Investment fees and costs (including (B) performance fees) (pa)*\" includes \"(B) performance fees) (pa)*\", we should subtract the \"(B) performance fees) (pa)*\" value, just output the pure management fee and costs value.",
"Besides, the \"Retirement account\" is the pre-fix fund name, should output it with fund/ share name together, e.g. \"Retirement account Capital Stable\"",
"The output should be:",
"{\"data\": [{\"fund name\": \"Retirement account Capital Stable\", \"share name\": \"Retirement account Capital Stable\", \"management_fee_and_costs\": 0.42, \"management_fee\": 0.42, \"performance_fee_costs\": 0.04}, {\"fund name\": \"Retirement account Balanced\", \"share name\": \"Retirement account Balanced\", \"management_fee_and_costs\": 0.46, \"management_fee\": 0.46, \"performance_fee_costs\": 0.06}]}",
"---Example 3 Start---",
"Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n", "Investment \noption \nInvestment fees and \ncosts (p.a.) \n1 \nTransaction \ncosts (p.a.) \nMySuper/ \nBalanced \n0.38% (including 0.09% \nPerformance fee) \n0.18% \nManaged \nGrowth \n0.38% (including 0.11% \nPerformance fee) \n0.08% \n",
"---Example 2 End---", "---Example 3 End---",
"The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.", "The column: \"Investment fees and costs (p.a.)\", \"including Performance fee\", meaning the value is the sum of \"Management costs\" and \"performance fee\", We should subtract the \"performance fee\" value, just output the \"Management costs\" value.",
"Both of management_fee and management_fee_and_costs are the values for \"Management costs\".", "Both of management_fee and management_fee_and_costs are the values for \"Management costs\".",
"So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.", "So, for fund: MySuper/Balanced, the value 0.38, including 0.09 Performance fee, so the Management costs is 0.38 - 0.09 = 0.29, performance_fee_costs is 0.09.",
"For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.", "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.",
"So the output should be:", "So the output should be:",
"{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}", "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}",
"---Example 3 Start---",
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
"---Example 3 End---",
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
"If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.",
"If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.",
"So the output should be:",
"{\"data\": [{\"fund name\": \"CFS Real Return Class A\", \"share name\": \"CFS Real Return Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
"\n", "\n",
"I. Some table is very complex, with many data points columns, please extract the relevant values.", "I. If exist **\"Maximum management fee\"** in context, please ignore relevant values.",
"---Example 1 Start---",
"Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n",
"---Example 1 End---",
"For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ",
"\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ",
"\"Performance fee (p.a.)\" as performance_fee_costs, ",
"\"Buy/sell spread (%)\" as buy_spread and sell_spread.",
"If one row has 5 decimal numbers, ",
"the 2nd decimal number is the administration_fees, ",
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
"the 4th decimal number is the performance_fee_costs, ",
"the 5th decimal number is the buy_spread and sell_spread.",
"If one row has 4 decimal numbers, ",
"the 2nd decimal number is the administration_fees, ",
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
"the 4th decimal number is the buy_spread and sell_spread.",
"Please always ignore the 1st decimal number, we need not the total sum values.",
"The output should be:",
"{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
"J. If exist **\"Maximum management fee\"** in context, please ignore relevant values.",
"---Example Start---", "---Example Start---",
"Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%", "Fund name \nMaximum \nmanagement \nfee (p.a.) \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity Income Fund 3.075% \nAUSTRALIAN SHARE \nFirst Sentier Wholesale Australian Share Fund 1.538%",
"---Example End---", "---Example End---",
"The values in example is **Maximum management fee**, should ignore all of them.", "The values in example is **Maximum management fee**, should ignore all of them.",
"The Output should be:", "The Output should be:",
"{\"data\": []}" "{\"data\": []}",
"J. The management fee and costs in paragraph with specific fund/ share prefix name: \"Account-based pension\" or \"Pre-retirement pension\"",
"---Example 1 Start---",
"Account-based pension \nInvestment fees \nand costs 2 \nHigh Growth 0.45%, Growth 0.49%",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Account-based pension High Growth\", \"share name\": \"Account-based pension High Growth\", \"management_fee_and_costs\": 0.45, \"management_fee\": 0.45}, {\"fund name\": \"Account-based pension Growth\", \"share name\": \"Account-based pension Growth\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}",
"---Example 2 Start---",
"Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. \nHigh Growth 0.48%, Growth 0.50%",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}",
"K. DO NOT extract management fees from \"Cost of product\" summaries. ",
"\"Cost of product\" figures should not be treated as 'Investment fees and costs'.",
"---Example Start---",
"Investment option Cost of product \nCash $141.00",
"---Example End---",
"FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!",
"The output should be:",
"{\"data\": []}",
"L. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option.",
"M. Identify the value of management fee and costs, and if it is written 0% or 0.00% or 0 or 0.00, then extract the same as 0, please don't ignore it."
], ],
"administration_fees":[ "administration_fees":[
"### Administration fees and costs",
"Administration fees and costs and total annual dollar-based charges are share class level data.", "Administration fees and costs and total annual dollar-based charges are share class level data.",
"Simple case:", "Simple case:",
"----Example 1 Start----", "----Example 1 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n", "Fees and costs summary \n\nVision income streams \n\nType of fee Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n2 \n0.25% pa of your account balance (made up of \n0.25% of your account balance which is capped \nat $1,050 pa plus a reserving margin of 0.00% \npa of each investment options assets).",
"----Example 1 End----", "----Example 1 End----",
"According to example, the administration fee is 0.25% pa, so administration_fees is 0.25, ",
"The output should be:",
"{\"data\": [{\"fund name\": \"Vision income streams\", \"share name\": \"Vision income streams\", \"administration_fees\": 0.25}]}",
"\n",
"----Example 2 Start----",
"Fees and costs summary \n\nLegalsuper Pension \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \n1 \nAdministration fees and \ncosts \n$67.60 pa ($1.30 per week) plus 0.29% pa \nof your account balance \n",
"----Example 2 End----",
"According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ", "According to example, the administration fee is $1.30 per week plus 0.29% pa, so administration_fees is 0.29, ",
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6", "total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}", "{\"data\": [{\"fund name\": \"Legalsuper Pension\", \"share name\": \"Legalsuper Pension\", \"administration_fees\": 0.29, \"total_annual_dollar_based_charges\": 67.6}]}",
"\n", "\n",
"----Example 2 Start----", "----Example 3 Start----",
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply)", "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply)",
"----Example 2 End----", "----Example 3 End----",
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
"total_annual_dollar_based_charges is 1 * 52 = 52", "total_annual_dollar_based_charges is 1 * 52 = 52",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
"---Example 3 Start---", "\n",
"---Example 4 Start---",
"\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n", "\nPrime Super Income Stream\nType of fee \nor cost \nAmount How and when paid \nOngoing annual fees and costs \n1 \nAdministration \nfees and costs \nAdministration \nfees of $1.30 \nper week \nPlus \n0.50% p.a. of \nyour account \nbalance, capped \nat $500 p.a. \nDeducted from your \naccount on the last \nbusiness day of each \nmonth, except if you \nare leaving Prime \nSuper, in which case \nit is deducted prior to \nyour exit from Prime \nSuper. \nInvestment \nfees and costs \n2 \n0.07% to 1.00% \nof assets p.a. \ndepending on \nthe investment \noption \nTaken into account \nprior to the declaration \nof weekly earning \nrates. This cost is not \ndeducted directly from \nyour account. \n",
"---Example 3 End---", "---Example 4 End---",
"According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ", "According to example, the administration fee is $1.30 per week plus 0.50% p.a., so administration_fees is 0.5, ",
"total_annual_dollar_based_charges is 1.30 * 52 = 67.6", "total_annual_dollar_based_charges is 1.30 * 52 = 67.6",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}", "{\"data\": [{\"fund name\": \"Prime Super Income Stream\", \"share name\": \"Prime Super Income Stream\", \"administration_fees\": 0.5, \"total_annual_dollar_based_charges\": 67.6}]}",
"---Example 4 Start---", "\n",
"---Example 5 Start---",
"At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the Fees and \nother costs section on \npages 40-46 for details \n", "At a glance summary \n\nImportant information about TelstraSuper RetireAccess income streams \n\nTTR income stream Retirement income stream Reference \nAdministration fee • \n• \n$1.00 per week plus 0.17% pa - if you have more than one account the $1.00 per \nweek fee will only apply to one account \nA fee rebate applies if your balance exceeds $1m, or if your and your spouses \ncombined account balances exceed $969,410 (conditions apply) \nRefer to the Fees and \nother costs section on \npages 40-46 for details \n",
"---Example 4 End---", "---Example 5 End---",
"According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ", "According to example, the administration fee is $1.00 per week plus 0.17% pa, so administration_fees is 0.17, ",
"total_annual_dollar_based_charges is 1 * 52 = 52", "total_annual_dollar_based_charges is 1 * 52 = 52",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}", "{\"data\": [{\"fund name\": \"TelstraSuper RetireAccess\", \"share name\": \"TelstraSuper RetireAccess\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 52}]}",
"---Example 6 Start---",
"Administration \nfees and costs \n1 \nFirstChoice Lifestage (MySuper product) \nand Select investment options \n(other than FirstRate Saver) \n0.04% p.a. \nThe percentagebased administration fee is reflected in \nthe daily unit price of your investment option and payable \nmonthly or as incurred by the option. \nFirstRate Saver \nFrom 0.35% to \n0.50% p.a. \nThe dollarbased administration fee of $5 per month is \npayable at the beginning of each month by deduction of \nunits from one of your options. \nDollar-based fee discounts \nThe current fee for FirstRate Saver is set out at \ncfs.com.au/personal/resources/funds-and-performance/ \nfirstrateinterestrates.html \nYour employer may be able to negotiate a lower dollar \nbased administration fee for employee members. \nplus \nDollar-based administration fee \nRetained benefit and spouse members are not entitled \nto this discount. \n$60 p.a. ($5 per month) per account \n",
"---Example 6 End---",
"According to example, the administration fee is 0.04, ",
"\"From 0.35% to 0.50% p.a.\", because it is the range value, need ignore and exclude, so administration_fees is 0.04, ",
"the total_annual_dollar_based_charges is 60 (5 per month * 12)",
"About fund name, it should be \"FirstChoice Lifestage\".",
"The output should be:",
"{\"data\": [{\"fund name\": \"FirstChoice Lifestage\", \"share name\": \"FirstChoice Lifestage\", \"administration_fees\": 0.04, \"total_annual_dollar_based_charges\": 60}]}",
"\n", "\n",
"Complex cases:", "Complex cases:",
"A. Need to add multiple numbers together.", "A. Need to add multiple numbers together.",
@ -341,11 +389,34 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}", "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"administration_fees\": 0.32}]}",
"---Example 2 Start---", "---Example 2 Start---",
"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Funds \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", "Mine Super\nType of fee or cost Amount (% pa) How and when paid \nOngoing annual fees and costs \n1 \nWe generally calculate and \ndeduct this fee daily when unit \nprices are determined. \nAdministration fees \nand costs \n0.16% pa \nPlus \n0.031% pa. \n",
"---Example 2 End---", "---Example 2 End---",
"Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees." "According to example, the relevant values: 0.16% and 0.031%, so administration_fees is 0.16 + 0.031 = 0.191",
"The output should be:",
"{\"data\": [{\"fund name\": \"Mine Super\", \"share name\": \"Mine Super\", \"administration_fees\": 0.191}]}",
"---Example 3 Start---",
"Fees and costs* \n\nRetirement account Transition to Retirement account \nAdministration fees (taken directly \nfrom your account) \n$1.50 per week plus 0.10% pa of your account balance on the day the fee \nis deducted (0.10% pa component is capped at $300 pa). \nAdministration costs (not taken \ndirectly from your account) \nThis is deducted from the Funds reserves throughout the year, not your account. \n0.09% pa (based on costs for the financial year ended 30 June 2024). \n\n\nRest Pension Product Disclosure Statement \n\n6",
"---Example 3 End---",
"According to the example, the administration fee is $1.50 per week plus 0.10% pa, Administration costs is 0.09% pa so administration_fees is 0.1 + 0.09 = 0.19, ",
"total_annual_dollar_based_charges is 1.50 * 52 = 78",
"The output should be:",
"{\"data\": [{\"fund name\": \"Rest Pension\", \"share name\": \"Rest Pension\", \"administration_fees\": 0.19, \"total_annual_dollar_based_charges\": 78}]}",
"---Example 4 Start---",
"Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Funds \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n",
"---Example 4 End---",
"Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees, only output total_annual_dollar_based_charges as 78.",
"B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).",
"---Example Start---",
"My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of options assets* \nFund1 0.12%\n",
"---Example End---",
"According to example, \"My Super\" is with \"Administration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a maximum of $1,000 p.a.) \n$0.50 per week deducted from your account balance at the end of each month or on exit.\"",
"so administration_fees is 0.17, total_annual_dollar_based_charges is 0.50 * 52 = 26, with production name: \"My Super\".",
"\"Fund1\" is with specific fund/ share name, so management_fee_and_costs and management_fee are: 0.12",
"The output should be:",
"{\"data\": [{\"fund name\": \"My Super\", \"share name\": \"My Super\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 26}, {\"fund name\": \"Fund1\", \"share name\": \"Fund1\", \"management_fee_and_costs\": 0.12, \"management_fee\": 0.12}]}"
], ],
"total_annual_dollar_based_charges": [ "total_annual_dollar_based_charges": [
"### Total annual dollar-based charges",
"Total annual dollar-based charges are share class level data.", "Total annual dollar-based charges are share class level data.",
"A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.", "A. Its value corresponds to the administration fees and costs that are charged on a weekly basis.",
"----Example Start----", "----Example Start----",
@ -357,22 +428,31 @@
"{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", "{\"data\": [{\"fund name\": \"MLC MasterKey Super & Pension Fundamentals\", \"share name\": \"MLC MasterKey Super & Pension Fundamentals\", \"total_annual_dollar_based_charges\": 78}, {\"fund name\": \"MLC Horizon 4 Balanced Portfolio\", \"share name\": \"MLC Horizon 4 Balanced Portfolio\", \"management_fee_and_costs\": 1.2, \"management_fee\": 1.2, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"\n", "\n",
"B. Please identify some case which not belong to the total_annual_dollar_based_charges, and output empty.", "B. Please identify some case which not belong to the total_annual_dollar_based_charges, and output empty.",
"----Example Start----", "----Example 1 Start----",
"Cost of product information \n\nCost of product for 1 year \n\nThe cost of product gives a summary calculation about \nhow ongoing annual fees and costs can affect your \nsuperannuation investment over a 1-year period for all \ninvestment options. It is calculated in the manner \nshown in the 'Example of annual fees and costs'. \n\nThe cost of product information assumes a balance of \n$50,000 at the beginning of the year. (Additional fees \nsuch as a buy/sell spread may apply refer to the Fees \nand costs summary table for the relevant investment \noption.) \n\nYou should use this figure to help compare \nsuperannuation products and investment options. \n\nInvestment option \nCash \nCost of product \nPerpetual Cash \n$60.00 \nFixed income and credit \nBentham Global Income \n$485.00 \n", "Cost of product information \n\nCost of product for 1 year \n\nThe cost of product gives a summary calculation about \nhow ongoing annual fees and costs can affect your \nsuperannuation investment over a 1-year period for all \ninvestment options. It is calculated in the manner \nshown in the 'Example of annual fees and costs'. \n\nThe cost of product information assumes a balance of \n$50,000 at the beginning of the year. (Additional fees \nsuch as a buy/sell spread may apply refer to the Fees \nand costs summary table for the relevant investment \noption.) \n\nYou should use this figure to help compare \nsuperannuation products and investment options. \n\nInvestment option \nCash \nCost of product \nPerpetual Cash \n$60.00 \nFixed income and credit \nBentham Global Income \n$485.00 \n",
"----Example End----", "----Example 1 End----",
"Explanation:", "Explanation:",
"The values provided in the example are not total annual dollar-based charges; ", "The values provided in the example are not total annual dollar-based charges; ",
"they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ", "they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ",
"This figure includes ongoing annual fees and costs, but it may not encompass all possible charges, such as additional fees like buy/sell spreads. ", "This figure includes ongoing annual fees and costs, but it may not encompass all possible charges, such as additional fees like buy/sell spreads. ",
"Therefore, it serves as a comparative tool rather than a comprehensive total of all annual charges.", "Therefore, it serves as a comparative tool rather than a comprehensive total of all annual charges.",
"The output should be empty:", "The output should be empty:",
"{\"data\": []}" "{\"data\": []}",
"----Example 2 Start----",
"Equals \nCost of product \n1 \nIf your balance was $50,000 at \nthe beginning of the year, then \nfor that year you will be charged \nfees and costs of $395 for the \nsuperannuation product. \n\n",
"----Example 2 End----",
"Explanation:",
"The values provided in the example are not total annual dollar-based charges; ",
"they represent the cost of product information, which is a calculated figure used to compare superannuation products and investment options. ",
"FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!!"
], ],
"buy_spread": [ "buy_spread": [
"### Buy/sell spread",
"Buy/sell spread is share class level data.",
"A. Exclude reported name", "A. Exclude reported name",
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ", "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ",
"Estimated transaction costs offset by buy/sell spreads (% pa), ", "Estimated transaction costs offset by buy/sell spreads (% pa), Transaction costs",
"---Example 1 Start---", "---Example 1 Start---",
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n", "Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
"---Example 1 End---", "---Example 1 End---",
@ -390,6 +470,13 @@
"---Example 3 End---", "---Example 3 End---",
"The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:", "The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:",
"{\"data\": []}", "{\"data\": []}",
"\n",
"---Example 4 Start---",
"Transaction costs \nOption % of options assets* \nHigh Growth 0.03% \nTaken into account in the daily calculation\nof unit prices\nMember activity related fees and costs \nBuy-sell spread Nil N/A\nSwitching fee Nil N/A\n",
"---Example 4 End---",
"According to example, please exclude Transaction costs.",
"\"Buy-sell spread\" data section is under \"Member activity related fees and costs\", the value is Nil, output for buy_spread and sell_spread should be:",
"{\"data\": []}",
"B. Simple case with simple table structure:", "B. Simple case with simple table structure:",
"---Example 1 Start---", "---Example 1 Start---",
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n", "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
@ -413,9 +500,16 @@
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n", "\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
"---Example 4 End---", "---Example 4 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund Class A\", \"share name\": \"Allan Gray Australian Equity Fund Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}" "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund Class A\", \"share name\": \"Allan Gray Australian Equity Fund Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}",
"\n",
"---Example 5 Start---",
"Fees and costs \n\nFund name \nManagement fees \nand costs (p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nBaillie Gifford Sustainable \nGrowth Fund Class A \n0.88% 0.10%\nBaillie Gifford Long Term \nGlobal Growth Fund Class A \n0.96% 0.05%\n\n",
"---Example 5 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Baillie Gifford Sustainable Growth Fund Class A\", \"share name\": \"Baillie Gifford Sustainable Growth Fund Class A\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.88, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Baillie Gifford Long Term Global Growth Fund Class A\", \"share name\": \"Baillie Gifford Long Term Global Growth Fund Class A\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.05, \"sell_spread\": 0.05}]}"
], ],
"performance_fee_costs": [ "performance_fee_costs": [
"### Performance fees",
"Performance fees is share class level data.", "Performance fees is share class level data.",
"A. If the performance fees is with the range, please ignore and output empty.", "A. If the performance fees is with the range, please ignore and output empty.",
"---Example Start---", "---Example Start---",
@ -436,7 +530,7 @@
"a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.", "a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.",
"b. This example mentioned share classes, please output according to share class.", "b. This example mentioned share classes, please output according to share class.",
"The output should be", "The output should be",
"{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}",
"D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", "D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0",
"---Example Start---", "---Example Start---",
"Fund/Investment Option \nManagement Fees \nand Costs \n(% pa) \n1 \nPerformance Fees 2 \n(% pa) \nTransaction Costs 3 \n(% pa) \nBT American Share Fund 1.08 0.00 0.00\nBT Asian Share Fund 1.10 0.00 0.10", "Fund/Investment Option \nManagement Fees \nand Costs \n(% pa) \n1 \nPerformance Fees 2 \n(% pa) \nTransaction Costs 3 \n(% pa) \nBT American Share Fund 1.08 0.00 0.00\nBT Asian Share Fund 1.10 0.00 0.10",
@ -454,6 +548,7 @@
"a. For this example, you have Example keyword in the header so you should not extract any datapoint values Like performance_fee_costs, management fee etc." "a. For this example, you have Example keyword in the header so you should not extract any datapoint values Like performance_fee_costs, management fee etc."
], ],
"minimum_initial_investment": [ "minimum_initial_investment": [
"### Minimum initial investment",
"Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.",
"---Example 1 Start---", "---Example 1 Start---",
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan", "The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
@ -488,6 +583,7 @@
"{\"data\": [{\"fund name\": \"Lifeplan Investment Bond\", \"minimum_initial_investment\": 1000}]}" "{\"data\": [{\"fund name\": \"Lifeplan Investment Bond\", \"minimum_initial_investment\": 1000}]}"
], ],
"benchmark_name": [ "benchmark_name": [
"### Benchmark name",
"Benchmark is fund level data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ", "Benchmark is fund level data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ",
"Sometime, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.", "Sometime, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.",
"A. Examples for single benchmark name", "A. Examples for single benchmark name",
@ -605,7 +701,10 @@
"management_fee_and_costs": [ "management_fee_and_costs": [
{ {
"keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"], "keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"],
"prompts": ["Complex management fee and costs rule:", "keywords_is_regex": false,
"sub_datapoints": ["administration_fees", "performance_fee_costs"],
"prompts": [
"### Complex management fee and costs rule",
"If the table with columns:", "If the table with columns:",
"\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"", "\"Administration fees\", \"Investment fees\" ,\"Estimated other investment costs\" and \"Estimated performance fees\"",
"The administration_fees is \"Administration fees\"", "The administration_fees is \"Administration fees\"",
@ -626,7 +725,10 @@
}, },
{ {
"keywords": ["Entry Fee option \nNil Entry option"], "keywords": ["Entry Fee option \nNil Entry option"],
"prompts": ["Complex management fee and costs rule:", "keywords_is_regex": false,
"sub_datapoints": ["performance_fee_costs"],
"prompts": [
"### Complex management fee and costs rule",
"If the table with columns:", "If the table with columns:",
"\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"", "\"Entry Fee option\", \"Nil Entry option\", \"Estimated Other investment costs\", \"Estimated Performance fees\"",
"The performance_fee_costs is \"Estimated Performance fees\"", "The performance_fee_costs is \"Estimated Performance fees\"",
@ -637,21 +739,36 @@
"---Example 1 Start---", "---Example 1 Start---",
"\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", "\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
"---Example 1 End---", "---Example 1 End---",
"The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.",
"Please pay attention below information", "Please pay attention below information",
"Assume the numeric column sequence number is from 1.", "Assume the numeric column sequence is from 1.",
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", "\"Entry Fee option\" values are as the 1st column values, \"Nil Entry option\" values are as the 2nd column values, \"Estimated other investment costs\" values are as the 3rd column values, \"Estimated Performance fees\" values are as the 4th column values.",
"For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ", "Here is the example to get data, step by step.",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", "For this fund in Example:",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", "Platinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n",
"Step 1 Get new fund name",
"Combine \"Platinum Asia\" with \"Entry Fee\" as \"Platinum Asia Entry Fee\"",
"Combine \"Platinum Asia\" with \"Nil Entry\" as \"Platinum Asia Nil Entry\"",
"Step 2 **EXCLUDE the values of the last three columns of data.**",
"ONLY KEEP these 4 values: 2.14 2.99 0.02 0.00 for next steps",
"Step 3 Calculate management_fee and management_fee_and_costs for these 2 new funds:",
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (Value of 1st column) + 0.02 (Value of 3rd column)",
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (Value of 2nd column) + 0.02 (Value of 3rd column)",
"**Make sure don't take \"Estimated other investment costs\" value from the wrong column!!!**",
"Step 4 Get performance_fee_costs",
"the fund: Platinum Asia Entry Fee, performance_fee_costs is 0 (Value of 4th column)",
"the fund: Platinum Asia Nil Entry, performance_fee_costs is 0 (Value of 4th column)",
"Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0", "Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0",
"**Make sure don't take \"Estimated Performance fees\" value from the wrong column!!!**",
"Please ignore the last fund name of previous PDF page, and extract data as these 4 steps for all of records in Context.",
"Therefore, the output should be:", "Therefore, the output should be:",
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0}, {\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}]}" "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0}, {\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}]}"
] ]
}, },
{ {
"keywords": ["Retirement and TTR income streams"], "keywords": ["Retirement and TTR income streams"],
"prompts": ["Complex management fee and costs rule:", "keywords_is_regex": false,
"prompts": [
"### Complex management fee and costs rule",
"For management_fee_and_costs, ", "For management_fee_and_costs, ",
"a. If the title is \"Retirement and TTR income streams\"", "a. If the title is \"Retirement and TTR income streams\"",
"it means each investment name is with two fund names, one is for Retirement as pension, another is for TTR.", "it means each investment name is with two fund names, one is for Retirement as pension, another is for TTR.",
@ -672,7 +789,10 @@
}, },
{ {
"keywords": ["Recoverable expenses \nEstimated other indirect costs"], "keywords": ["Recoverable expenses \nEstimated other indirect costs"],
"prompts": ["Complex management fee and costs rule:", "keywords_is_regex": false,
"sub_datapoints": ["performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread"],
"prompts": [
"### Complex management fee and costs rule",
"If the table with columns:", "If the table with columns:",
"\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"", "\"Management fee (% pa)\", \"Recoverable expenses\", \"Estimated other indirect costs\", \"Peformance fees charged to the Investment Option by underlying managers\", \"Performance fees charged by interposed vehicles\", \"Buy/sell spreads\"",
"The management_fee is \"Management fee (% pa)\".", "The management_fee is \"Management fee (% pa)\".",
@ -714,8 +834,10 @@
}, },
{ {
"keywords":["Plus other investment fees and costs \nEquals investment fees and costs"], "keywords":["Plus other investment fees and costs \nEquals investment fees and costs"],
"keywords_is_regex": false,
"sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"],
"prompts": [ "prompts": [
"Complex management fee and costs rule:", "### Complex management fee and costs rule",
"If the table with columns:", "If the table with columns:",
"\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".", "\"Performance fee\", \"Plus other investment fees and costs\", \"Equals investment fees and costs\", \"Transaction costs(net)\", \"Buy-sell spreads\", \"Transaction costs(gross)\".",
"Both of the management_fee and management_fee_and_costs are \"Plus other investment fees and costs\".", "Both of the management_fee and management_fee_and_costs are \"Plus other investment fees and costs\".",
@ -730,6 +852,52 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" "{\"data\": [{\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Super & Pension pre-retirement phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"MLC Inflation Plus Conservative Portfolio\", \"share name\": \"Retirement Phase\", \"performance_fee_costs\": 0.18, \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}"
] ]
},
{
"keywords":["Total\\s*administration\\s*and (management|investment)\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"],
"keywords_is_regex": true,
"sub_datapoints": ["administration_fees", "performance_fee_costs", "buy_spread", "sell_spread"],
"prompts": [
"### Complex management fee and costs rule",
"---Example Start---",
"Option name \nTotal administration\nand investment\nfees and costs (p.a.)\n= \nAdministration\nfees and\ncosts (p.a.)\n+ \nInvestment fees \nand costs (p.a.) \n2 \n+ \nPerformance \nfee (p.a.) \n1 \nBuy/sell\nspread\n(%)\n6 \nCFS Multi-Manager Multi-Sector (These investment options are located in the Investment Options Menu.) \nCFS Defensive \n0.94% \n0.20% 0.74%0.15 \nCFS Conservative 1.04% \n1 \n0.20% 0.81% 0.03%\n1 \n0.15 \n",
"---Example End---",
"For this table, there are \"Administration fees and costs (p.a.)\" as administration_fees, ",
"\"Investment fees and costs (p.a.)\" as management_fee_and_costs and management_fee, ",
"\"Performance fee (p.a.)\" as performance_fee_costs, ",
"\"Buy/sell spread (%)\" as buy_spread and sell_spread.",
"If one row has 5 decimal numbers, ",
"the 2nd decimal number is the administration_fees, ",
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
"the 4th decimal number is the performance_fee_costs, ",
"the 5th decimal number is the buy_spread and sell_spread.",
"If one row has 4 decimal numbers, ",
"the 2nd decimal number is the administration_fees, ",
"the 3rd decimal number is the management_fee_and_costs and management_fee, ",
"the 4th decimal number is the buy_spread and sell_spread.",
"\"Buy/sell spread\" is always as the last decimal value column, for buy_spread and sell_spread, please extract all of them.",
"Please always ignore the 1st decimal number, we need not the total sum values.",
"The output should be:",
"{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
]
},
{
"keywords":["Total\\s*of\\s*(management|investment)\\s*fees\\s*and\\s*costs\\s*and\\s*performance\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"],
"keywords_is_regex": true,
"sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"],
"prompts": [
"### Complex management fee and costs rule",
"---Example Start---",
"Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n",
"---Example End---",
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
"If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.",
"If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.",
"So the output should be:",
"{\"data\": [{\"fund name\": \"CFS Real Return Class A\", \"share name\": \"CFS Real Return Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}"
]
} }
] ]
} }
@ -782,19 +950,12 @@
}, },
"output_requirement": { "output_requirement": {
"common": [ "common": [
"If possible, please extract fund name, share name, data points values as the output.",
"If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".", "If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".",
"Only output the data point which with relevant value.", "Only output the data point which with relevant value.",
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
"Please also output the data point reported name in context.", "Please also output the data point reported name in context.",
"Example:",
"---Example Start---",
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n",
"---Example End---",
"Output:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]}",
"Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.", "Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.",
"The output should be JSON format, the format is like below example(s):" "The output should be JSON format, the format is like below example(s):"
], ],
@ -876,7 +1037,8 @@
}, },
"end": [ "end": [
"Only output JSON data.", "Only output JSON data.",
"Don't output the value which not exist in context.", "Please re-check before output answer, DO NOT output the data point and value which not exist in context.",
"DO NOT use the example values from a representative fund (such as Balanced Growth) for other funds unless explicitly mentioned",
"If can't find fund name or share class name in context, please output empty JSON data: {\"data\": []}" "If can't find fund name or share class name in context, please output empty JSON data: {\"data\": []}"
] ]
} }

21
main.py
View File

@ -453,7 +453,6 @@ def batch_start_job(
pdf_folder: str = "/data/emea_ar/pdf/", pdf_folder: str = "/data/emea_ar/pdf/",
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
doc_data_excel_file: str = None, doc_data_excel_file: str = None,
document_mapping_file: str = None,
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/", output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
@ -1051,7 +1050,6 @@ def batch_run_documents(
doc_source: str = "emea_ar", doc_source: str = "emea_ar",
special_doc_id_list: list = None, special_doc_id_list: list = None,
pdf_folder: str = r"/data/emea_ar/pdf/", pdf_folder: str = r"/data/emea_ar/pdf/",
document_mapping_file: str = None,
output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", output_extract_data_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/", output_extract_data_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
@ -1090,7 +1088,6 @@ def batch_run_documents(
pdf_folder, pdf_folder,
output_pdf_text_folder, output_pdf_text_folder,
page_filter_ground_truth_file, page_filter_ground_truth_file,
document_mapping_file,
output_extract_data_child_folder, output_extract_data_child_folder,
output_mapping_child_folder, output_mapping_child_folder,
output_extract_data_total_folder, output_extract_data_total_folder,
@ -1110,7 +1107,6 @@ def batch_run_documents(
pdf_folder, pdf_folder,
output_pdf_text_folder, output_pdf_text_folder,
page_filter_ground_truth_file, page_filter_ground_truth_file,
document_mapping_file,
output_extract_data_child_folder, output_extract_data_child_folder,
output_mapping_child_folder, output_mapping_child_folder,
output_extract_data_total_folder, output_extract_data_total_folder,
@ -1452,7 +1448,7 @@ def get_aus_prospectus_document_category():
def test_post_adjust_extract_data(): def test_post_adjust_extract_data():
doc_id = "397107472" doc_id = "480854121"
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1532,13 +1528,21 @@ if __name__ == "__main__":
doc_source = "aus_prospectus" doc_source = "aus_prospectus"
# doc_source = "emea_ar" # doc_source = "emea_ar"
if doc_source == "aus_prospectus": if doc_source == "aus_prospectus":
# document_sample_file = (
# r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt"
# )
document_sample_file = ( document_sample_file = (
r"./sample_documents/aus_prospectus_46_documents_sample.txt" r"./sample_documents/aus_prospectus_46_documents_sample.txt"
) )
# document_sample_file = (
# r"./sample_documents/aus_prospectus_87_vision_cfs_documents_sample.txt"
# )
logger.info(f"Start to run document sample file: {document_sample_file}")
with open(document_sample_file, "r", encoding="utf-8") as f: with open(document_sample_file, "r", encoding="utf-8") as f:
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()
document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" if len(doc_id.strip()) > 0]
# special_doc_id_list = ["441280757"] # special_doc_id_list = ["470879332", "462780211", "561929947", "422100350"]
# special_doc_id_list = ["462780211", "539999907"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (
@ -1559,7 +1563,6 @@ if __name__ == "__main__":
doc_source=doc_source, doc_source=doc_source,
special_doc_id_list=special_doc_id_list, special_doc_id_list=special_doc_id_list,
pdf_folder=pdf_folder, pdf_folder=pdf_folder,
document_mapping_file=document_mapping_file,
output_pdf_text_folder=output_pdf_text_folder, output_pdf_text_folder=output_pdf_text_folder,
output_extract_data_child_folder=output_extract_data_child_folder, output_extract_data_child_folder=output_extract_data_child_folder,
output_extract_data_total_folder=output_extract_data_total_folder, output_extract_data_total_folder=output_extract_data_total_folder,

459
mini_main.py Normal file
View File

@ -0,0 +1,459 @@
import os
import json
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
import time
import fitz
import re
from io import BytesIO
from traceback import print_exc
from utils.logger import logger
from utils.pdf_download import download_pdf_from_documents_warehouse
from utils.sql_query_util import query_document_fund_mapping
from utils.pdf_util import PDFUtil
from utils.biz_utils import add_slash_to_text_as_regex
from core.page_filter import FilterPages
from core.data_extraction import DataExtraction
from core.data_mapping import DataMapping
from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call
from core.metrics import Metrics
import certifi
class EMEA_AR_Parsing:
def __init__(
    self,
    doc_id: str,
    doc_source: str = "emea_ar",
    pdf_folder: str = r"/data/emea_ar/pdf/",
    output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/",
    output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/",
    output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
    extract_way: str = "text",
    drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
    compare_with_provider: bool = True
) -> None:
    """Prepare everything needed to parse one document.

    The constructor downloads the PDF, queries the document/fund mapping,
    creates the output folders, builds the ``FilterPages`` helper and runs
    the page-filter job once so that ``datapoint_page_info``,
    ``result_details`` and ``datapoints`` are available afterwards.

    Args:
        doc_id: Identifier of the document to parse.
        doc_source: Document family; also selects
            ``./configuration/<doc_source>/misc_config.json``.
        pdf_folder: Folder handed to the PDF download helper (created if missing).
        output_pdf_text_folder: Folder passed through to ``FilterPages``.
        output_extract_data_folder: Base folder for extraction output; a
            ``by_<extract_way>/`` sub-folder is appended.
        output_mapping_data_folder: Base folder for mapping output; a
            ``by_<extract_way>/`` sub-folder is appended.
        extract_way: Extraction mode; ``None``/empty falls back to ``"text"``.
        drilldown_folder: Folder for drilldown output; ``None``/empty falls
            back to the default path.
        compare_with_provider: Stored and later forwarded to ``DataMapping``.
    """
    self.doc_id = doc_id
    self.doc_source = doc_source
    self.pdf_folder = pdf_folder
    os.makedirs(self.pdf_folder, exist_ok=True)
    self.compare_with_provider = compare_with_provider
    self.pdf_file = self.download_pdf()
    self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False)

    if extract_way is None or len(extract_way) == 0:
        extract_way = "text"
    self.extract_way = extract_way

    # The image folder is only needed (and only created) in image mode.
    self.output_extract_image_folder = None
    if self.extract_way == "image":
        self.output_extract_image_folder = (
            r"/data/emea_ar/output/extract_data/images/"
        )
        os.makedirs(self.output_extract_image_folder, exist_ok=True)

    def _resolve_output_folder(folder, default_folder):
        """Apply the default, force a trailing slash, append by_<extract_way>/."""
        if folder is None or len(folder) == 0:
            folder = default_folder
        if not folder.endswith("/"):
            folder = f"{folder}/"
        if extract_way is not None and len(extract_way) > 0:
            folder = f"{folder}by_{extract_way}/"
        return folder

    self.output_extract_data_folder = _resolve_output_folder(
        output_extract_data_folder, r"/data/emea_ar/output/extract_data/docs/"
    )
    os.makedirs(self.output_extract_data_folder, exist_ok=True)

    self.output_mapping_data_folder = _resolve_output_folder(
        output_mapping_data_folder, r"/data/emea_ar/output/mapping_data/docs/"
    )
    os.makedirs(self.output_mapping_data_folder, exist_ok=True)

    self.filter_pages = FilterPages(
        self.doc_id,
        self.pdf_file,
        self.document_mapping_info_df,
        self.doc_source,
        output_pdf_text_folder,
    )
    self.page_text_dict = self.filter_pages.page_text_dict
    self.datapoint_page_info, self.result_details = self.get_datapoint_page_info()
    self.datapoints = self.get_datapoints_from_datapoint_page_info()

    if drilldown_folder is None or len(drilldown_folder) == 0:
        drilldown_folder = r"/data/emea_ar/output/drilldown/"
    os.makedirs(drilldown_folder, exist_ok=True)
    self.drilldown_folder = drilldown_folder

    # Drilldown stays off unless the per-source misc config switches it on.
    self.apply_drilldown = False
    config_path = os.path.join(
        f"./configuration/{doc_source}/", "misc_config.json"
    )
    if os.path.exists(config_path):
        with open(config_path, "r", encoding="utf-8") as config_handle:
            loaded_config = json.load(config_handle)
        self.apply_drilldown = loaded_config.get("apply_drilldown", False)
def download_pdf(self) -> str:
    """Download this document's PDF and return what the download helper reports.

    Delegates to ``download_pdf_from_documents_warehouse`` with
    ``self.pdf_folder`` and ``self.doc_id``.
    """
    return download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id)
def get_datapoint_page_info(self) -> tuple:
    """Run the page-filter job and return ``(datapoint_page_info, result_details)``."""
    page_info, details = self.filter_pages.start_job()
    return page_info, details
def get_datapoints_from_datapoint_page_info(self) -> list:
    """Return the keys of ``self.datapoint_page_info`` without the ``"doc_id"`` entry.

    Key order is preserved.
    """
    return [
        name for name in self.datapoint_page_info.keys() if name != "doc_id"
    ]
def extract_data(
    self,
    re_run: bool = False,
) -> tuple:
    """Extract the document's data, reusing a cached JSON result when allowed.

    Args:
        re_run: When ``False`` and ``<output_extract_data_folder>/json/<doc_id>.json``
            exists, that file is loaded instead of running the extraction again.

    Returns:
        A 2-tuple ``(data_from_gpt, annotation_list)``. ``data_from_gpt`` is
        the cached or freshly extracted data, or an empty list when the
        extraction raised. ``annotation_list`` is the drilldown result, or an
        empty list when drilldown is disabled or failed.
    """
    found_data = False
    if not re_run:
        output_data_json_folder = os.path.join(
            self.output_extract_data_folder, "json/"
        )
        os.makedirs(output_data_json_folder, exist_ok=True)
        json_file = os.path.join(output_data_json_folder, f"{self.doc_id}.json")
        if os.path.exists(json_file):
            logger.info(
                f"The document: {self.doc_id} has been parsed, loading data from {json_file}"
            )
            with open(json_file, "r", encoding="utf-8") as f:
                data_from_gpt = json.load(f)
            found_data = True

    if not found_data:
        try:
            data_extraction = DataExtraction(
                self.doc_source,
                self.doc_id,
                self.pdf_file,
                self.output_extract_data_folder,
                self.page_text_dict,
                self.datapoint_page_info,
                self.datapoints,
                self.document_mapping_info_df,
                extract_way=self.extract_way,
                output_image_folder=self.output_extract_image_folder,
            )
            data_from_gpt = data_extraction.extract_data()
        except Exception as e:
            logger.error(f"Error: {e}")
            print_exc()
            # Fall back to an empty list: the drilldown and mapping steps
            # iterate this value as a list of per-page dicts, so the former
            # {"data": []} dict would be walked key-by-key and fail on the
            # first string key.
            data_from_gpt = []

    # Drilldown data to relevant PDF document (best effort: a failure is
    # logged and leaves the annotation list empty).
    annotation_list = []
    if self.apply_drilldown:
        try:
            annotation_list = self.drilldown_pdf_document(data_from_gpt)
        except Exception as e:
            logger.error(f"Error: {e}")
    return data_from_gpt, annotation_list
def drilldown_pdf_document(self, data_from_gpt: list) -> list:
"""Locate extracted values in the PDF and persist the resulting annotations.

Builds one drilldown request per extracted data-point value and per
reported name, hands them to ``PDFUtil.batch_drilldown`` and writes the
source requests and returned annotations to an Excel workbook and a JSON
file under ``<drilldown_folder>/data/``.

Args:
data_from_gpt: Iterable of dicts read via ``doc_id``, ``page_index`` and
``extract_data`` (with ``data`` and ``dp_reported_name`` inside).

Returns:
The annotation list (empty when the drilldown result is empty).

NOTE(review): indentation is not visible in this view, so the exact nesting
of the loops and of the export section should be confirmed against the
original file before relying on the comments below.
"""
logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}")
pdf_util = PDFUtil(self.pdf_file)
drilldown_data_list = []
for data in data_from_gpt:
doc_id = str(data.get("doc_id", ""))
page_index = data.get("page_index", -1)
# Entries without a page index are skipped.
if page_index == -1:
continue
extract_data_list = data.get("extract_data", {}).get("data", [])
dp_reported_name_dict = data.get("extract_data", {}).get(
"dp_reported_name", {}
)
# Values already queued are remembered here so each one is requested once.
highlighted_value_list = []
for extract_data in extract_data_list:
for data_point, value in extract_data.items():
if value in highlighted_value_list:
continue
# ter, ogc and performance_fee values are not drilled down.
if data_point in ["ter", "ogc", "performance_fee"]:
continue
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": value,
"annotation_attribute": {},
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(value)
# Reported names are queued under "<data_point>_reported_name".
for data_point, reported_name in dp_reported_name_dict.items():
if reported_name in highlighted_value_list:
continue
data_point = f"{data_point}_reported_name"
drilldown_data = {
"doc_id": doc_id,
"page_index": page_index,
"data_point": data_point,
"parent_text_block": None,
"value": reported_name,
"annotation_attribute": {},
}
drilldown_data_list.append(drilldown_data)
highlighted_value_list.append(reported_name)
drilldown_result = pdf_util.batch_drilldown(
drilldown_data_list=drilldown_data_list,
output_pdf_folder=self.drilldown_folder,
)
annotation_list = []
# NOTE(review): doc_id is only assigned inside the loop above, so from here
# on it holds the value of the last iterated entry and is undefined when
# data_from_gpt is empty - consider using self.doc_id instead.
if len(drilldown_result) > 0:
logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully")
annotation_list = drilldown_result.get("annotation_list", [])
for annotation in annotation_list:
annotation["doc_id"] = doc_id
if self.drilldown_folder is not None and len(self.drilldown_folder) > 0:
drilldown_data_folder = os.path.join(self.drilldown_folder, "data/")
os.makedirs(drilldown_data_folder, exist_ok=True)
drilldown_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.xlsx"
)
drilldown_source_df = pd.DataFrame(drilldown_data_list)
annotation_list_df = pd.DataFrame(annotation_list)
# Put the annotation columns in the order doc_id, pdf_file, page_index,
# data_point, value, matching_val_area, normalized_bbox; if the selection
# fails the frame is left as it was and the error is only logged.
try:
annotation_list_df = annotation_list_df[
[
"doc_id",
"pdf_file",
"page_index",
"data_point",
"value",
"matching_val_area",
"normalized_bbox",
]
]
except Exception as e:
logger.error(f"Error: {e}")
logger.info(f"Writing drilldown data to {drilldown_file}")
# Workbook with two sheets: the requests ("source_data") and the
# annotations ("drilldown_data"). A write failure is logged, not raised.
try:
with pd.ExcelWriter(drilldown_file) as writer:
drilldown_source_df.to_excel(
writer, index=False, sheet_name="source_data"
)
annotation_list_df.to_excel(
writer, index=False, sheet_name="drilldown_data"
)
except Exception as e:
logger.error(f"Error: {e}")
# The returned list is rebuilt from the (possibly column-filtered) frame.
annotation_list = annotation_list_df.to_dict(orient="records")
# The same records are also dumped as JSON next to the workbook.
try:
drilldown_json_file = os.path.join(
drilldown_data_folder, f"{doc_id}_drilldown.json"
)
with open(drilldown_json_file, "w", encoding="utf-8") as f:
json.dump(annotation_list, f, ensure_ascii=False, indent=4)
except Exception as e:
logger.error(f"Error: {e}")
return annotation_list
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
"""Map extracted data to funds/share classes, reusing cached results when allowed.

Args:
data_from_gpt: Raw extracted data handed to ``DataMapping``.
re_run: When ``False``, a cached ``<output_mapping_data_folder>/json/<doc_id>.json``
is loaded if present instead of mapping again.

Returns:
Cached mapping data, cached or freshly merged data for ``aus_prospectus``
documents, or the result of ``DataMapping.mapping_raw_data_entrance()``.

NOTE(review): indentation is not visible in this view; in particular, which
``if`` the ``else: return doc_mapping_data`` branch belongs to should be
confirmed against the original file.
"""
if not re_run:
output_data_json_folder = os.path.join(
self.output_mapping_data_folder, "json/"
)
os.makedirs(output_data_json_folder, exist_ok=True)
json_file = os.path.join(output_data_json_folder, f"{self.doc_id}.json")
if os.path.exists(json_file):
logger.info(
f"The fund/ share of this document: {self.doc_id} has been mapped, loading data from {json_file}"
)
with open(json_file, "r", encoding="utf-8") as f:
doc_mapping_data = json.load(f)
if self.doc_source == "aus_prospectus":
# The merged-data folder is derived by cutting the JSON folder path at the
# text "output"; this only proceeds when that text occurs exactly once.
output_data_folder_splits = output_data_json_folder.split("output")
if len(output_data_folder_splits) == 2:
merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/'
os.makedirs(merged_data_folder, exist_ok=True)
merged_data_json_folder = os.path.join(merged_data_folder, "json/")
os.makedirs(merged_data_json_folder, exist_ok=True)
merged_data_excel_folder = os.path.join(merged_data_folder, "excel/")
os.makedirs(merged_data_excel_folder, exist_ok=True)
merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json")
# Reuse a cached merged file when there is one; otherwise merge the cached
# mapping data now.
if os.path.exists(merged_data_file):
with open(merged_data_file, "r", encoding="utf-8") as f:
merged_data_list = json.load(f)
return merged_data_list
else:
data_mapping = DataMapping(
self.doc_id,
self.datapoints,
data_from_gpt,
self.document_mapping_info_df,
self.output_mapping_data_folder,
self.doc_source,
compare_with_provider=self.compare_with_provider
)
merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data,
merged_data_json_folder,
merged_data_excel_folder)
return merged_data_list
else:
return doc_mapping_data
# Reached when re_run is set, when no cached mapping file exists, or when a
# cached branch above did not return: the mapping is computed from scratch.
"""
doc_id,
datapoints: list,
raw_document_data_list: list,
document_mapping_info_df: pd.DataFrame,
output_data_folder: str,
"""
data_mapping = DataMapping(
self.doc_id,
self.datapoints,
data_from_gpt,
self.document_mapping_info_df,
self.output_mapping_data_folder,
self.doc_source,
compare_with_provider=self.compare_with_provider
)
return data_mapping.mapping_raw_data_entrance()
def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> tuple:
    """Run page filtering for one document.

    Args:
        doc_id: Identifier of the document to process.
        pdf_folder: Folder the PDF is downloaded to / read from.
        doc_source: Document family forwarded to ``EMEA_AR_Parsing``.

    Returns:
        ``(datapoint_page_info, result_details)`` as produced by
        ``EMEA_AR_Parsing.get_datapoint_page_info``. The previous ``-> None``
        annotation contradicted this returned tuple.
    """
    logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(
        doc_id, doc_source=doc_source, pdf_folder=pdf_folder
    )
    # NOTE(review): the constructor already calls get_datapoint_page_info()
    # once and stores the result on the instance; this second call is kept to
    # preserve existing behaviour.
    datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info()
    return datapoint_page_info, result_details
def extract_data(
    doc_id: str,
    doc_source: str,
    pdf_folder: str,
    output_data_folder: str,
    extract_way: str = "text",
    re_run: bool = False,
) -> tuple:
    """Extract data for a single document.

    Args:
        doc_id: Identifier of the document to process.
        doc_source: Document family forwarded to ``EMEA_AR_Parsing``.
        pdf_folder: Folder the PDF is downloaded to / read from.
        output_data_folder: Base folder for the extraction output.
        extract_way: Extraction mode forwarded to ``EMEA_AR_Parsing``.
        re_run: Forwarded to ``EMEA_AR_Parsing.extract_data``.

    Returns:
        ``(data_from_gpt, annotation_list)`` exactly as returned by
        ``EMEA_AR_Parsing.extract_data``. The previous ``-> None`` annotation
        contradicted this returned tuple.
    """
    logger.info(f"Extract EMEA AR data for doc_id: {doc_id}")
    emea_ar_parsing = EMEA_AR_Parsing(
        doc_id,
        doc_source=doc_source,
        pdf_folder=pdf_folder,
        output_extract_data_folder=output_data_folder,
        extract_way=extract_way,
    )
    data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run)
    return data_from_gpt, annotation_list
def batch_extract_data(
    pdf_folder: str,
    doc_source: str = "emea_ar",
    output_child_folder: str = r"/data/emea_ar/output/extract_data/docs/",
    output_total_folder: str = r"/data/emea_ar/output/extract_data/total/",
    extract_way: str = "text",
    special_doc_id_list: list = None,
    re_run: bool = False,
) -> None:
    """Extract data for the listed documents whose PDFs are in ``pdf_folder``.

    Only PDFs whose base name (text before the first ``.``) appears in
    ``special_doc_id_list`` are processed. When the list is ``None`` or empty
    nothing is extracted and the function returns immediately.

    Args:
        pdf_folder: Folder that is searched for ``*.pdf`` files.
        doc_source: Document family forwarded to ``extract_data``.
        output_child_folder: Per-document output folder forwarded to
            ``extract_data``.
        output_total_folder: Currently unused. The combined Excel export that
            consumed it could only run for an empty id list, which returns
            early, so that unreachable code was removed. The parameter is
            kept so existing callers keep working.
        extract_way: Extraction mode forwarded to ``extract_data``.
        special_doc_id_list: Document ids to process.
        re_run: Forwarded to ``extract_data``.
    """
    if special_doc_id_list is None or len(special_doc_id_list) == 0:
        # The old message claimed that all documents would be extracted,
        # although the function returns here without processing anything.
        logger.info(
            f"No special doc_id list provided, skipping extraction for documents in {pdf_folder}"
        )
        return

    wanted_doc_ids = set(special_doc_id_list)
    # os.path.join also works when pdf_folder has no trailing slash.
    pdf_files = glob(os.path.join(pdf_folder, "*.pdf"))
    for pdf_file in tqdm(pdf_files):
        doc_id = os.path.basename(pdf_file).split(".")[0]
        if doc_id not in wanted_doc_ids:
            continue
        # The per-document results are written by extract_data itself; the
        # returned (data, annotations) tuple is not needed here.
        extract_data(
            doc_id=doc_id,
            doc_source=doc_source,
            pdf_folder=pdf_folder,
            output_data_folder=output_child_folder,
            extract_way=extract_way,
            re_run=re_run,
        )
def test_translate_pdf():
    """Run the ``Translate_PDF`` job on one hard-coded sample PDF."""
    from core.data_translate import Translate_PDF

    job = Translate_PDF(
        r"/data/emea_ar/pdf/451063582.pdf",
        r"/data/translate/output/",
    )
    job.start_job()
if __name__ == "__main__":
    # Make SSL use the certificate file path reported by certifi.
    os.environ["SSL_CERT_FILE"] = certifi.where()

    doc_source = "aus_prospectus"
    re_run = True
    extract_way = "text"

    # Per-source run settings; an unknown source is rejected below.
    run_settings_by_source = {
        "aus_prospectus": {
            "special_doc_id_list": ["539266874"],
            "pdf_folder": r"/data/aus_prospectus/pdf/",
            # Defined for this source only and not passed to batch_extract_data.
            "output_pdf_text_folder": r"/data/aus_prospectus/output/pdf_text/",
            "output_child_folder": r"/data/aus_prospectus/output/extract_data/docs/",
            "output_total_folder": r"/data/aus_prospectus/output/extract_data/total/",
        },
        "emea_ar": {
            "special_doc_id_list": ["514636993"],
            "pdf_folder": r"/data/emea_ar/pdf/",
            "output_child_folder": r"/data/emea_ar/output/extract_data/docs/",
            "output_total_folder": r"/data/emea_ar/output/extract_data/total/",
        },
    }
    if doc_source not in run_settings_by_source:
        raise ValueError(f"Invalid doc_source: {doc_source}")
    run_settings = run_settings_by_source[doc_source]

    batch_extract_data(
        pdf_folder=run_settings["pdf_folder"],
        doc_source=doc_source,
        output_child_folder=run_settings["output_child_folder"],
        output_total_folder=run_settings["output_total_folder"],
        extract_way=extract_way,
        special_doc_id_list=run_settings["special_doc_id_list"],
        re_run=re_run,
    )

File diff suppressed because one or more lines are too long

View File

@ -1483,13 +1483,21 @@ def set_mapping_to_data_side_documents_data():
# mapping_sheet = "document_mapping" # mapping_sheet = "document_mapping"
# output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx" # output_file_path = r"/data/aus_prospectus/output/ravi_100_documents/audited_file_phase2_with_mapping.xlsx"
data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx" # data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth.xlsx"
# data_sheet = "ground_truth"
# raw_name_column = "raw_share_name"
# mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx"
# mapping_sheet = "document_mapping"
# raw_name_mapping_column = None
# output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
data_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth.xlsx"
data_sheet = "ground_truth" data_sheet = "ground_truth"
raw_name_column = "raw_share_name" raw_name_column = "raw_share_name"
mapping_file_path = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" mapping_file_path = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx"
mapping_sheet = "document_mapping" mapping_sheet = "document_mapping"
raw_name_mapping_column = None raw_name_mapping_column = None
output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" output_file_path = r"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx"
set_mapping_to_raw_name_data(data_file_path=data_file_path, set_mapping_to_raw_name_data(data_file_path=data_file_path,
data_sheet=data_sheet, data_sheet=data_sheet,
raw_name_column=raw_name_column, raw_name_column=raw_name_column,
@ -1582,8 +1590,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou
"administration_fees", "administration_fees",
"minimum_initial_investment", "minimum_initial_investment",
"benchmark_name", "benchmark_name",
"performance_fee", "performance_fee_costs",
"performance_fee_charged",
"buy_spread", "buy_spread",
"sell_spread", "sell_spread",
"total_annual_dollar_based_charges", "total_annual_dollar_based_charges",
@ -1593,9 +1600,7 @@ def set_mapping_to_raw_name_data(data_file_path: str = r"/data/aus_prospectus/ou
"withdrawal_fee", "withdrawal_fee",
"exit_fee", "exit_fee",
"switching_fee", "switching_fee",
"activity_fee", "activity_fee"
"hurdle_rate",
"analyst_name"
]] ]]
except Exception as e: except Exception as e:
print(e) print(e)
@ -1733,7 +1738,7 @@ def update_data_by_latest_ground_truth():
if __name__ == "__main__": if __name__ == "__main__":
update_data_by_latest_ground_truth() # update_data_by_latest_ground_truth()
# set_provider_to_ground_truth( # set_provider_to_ground_truth(
# groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx", # groud_truth_file=r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx",
# ground_truth_sheet="Sheet1", # ground_truth_sheet="Sheet1",
@ -1741,7 +1746,7 @@ if __name__ == "__main__":
# document_mapping_sheet="document_mapping" # document_mapping_sheet="document_mapping"
# ) # )
# set_mapping_to_data_side_documents_data() set_mapping_to_data_side_documents_data()
# source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx" # source_file = r"/data/aus_prospectus/ground_truth/phase2_file/17_documents/audited_file_phase2_with_mapping.xlsx"
# target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" # target_file = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"

View File

@ -0,0 +1,87 @@
430229604
430249980
434533711
448576798
448576868
448576914
448576924
448577874
448577877
448578148
448701586
448906715
448906720
448906722
448907811
451234748
454947973
454947982
454948291
454948296
455232983
455235248
462770987
470958290
470958296
478920274
478946988
479996914
479996918
480713037
480726184
480726185
480854103
480854105
480854113
480854115
480854118
480854120
480854121
480854129
481877313
484628699
484628701
484628702
484628703
495516375
495547519
500579230
506913190
509581748
520698753
520702746
520703007
521591949
521606716
521606755
523516443
525464665
528208796
534933875
539999907
539999916
540028470
542294088
544886057
548035617
550533961
550769189
552727485
555377021
556527310
557362550
557526104
557526108
557526111
557526129
557526130
557526143
557526145
562753667
562753673
562754590
570781265
572302455
572302463
573372424
577949367

View File

@ -0,0 +1,3 @@
539999907
455235248
448576924

View File

@ -0,0 +1,6 @@
553449169
539791362
573372424
448906722
462780211
563608192

77
test_k_shape.py Normal file
View File

@ -0,0 +1,77 @@
import pandas as pd
import numpy as np
import sys
import os
# Add the project directory to the import path
sys.path.append('crypto_quant')
from crypto_quant.core.biz.metrics_calculation import MetricsCalculation
def test_k_shape():
    """Print a step-by-step check of the "一字" K-shape conditions for one candle.

    Builds a single OHLC row, derives the range/body features, prints whether
    the two thresholds (price range ratio < 0.01, body fill > 0.9) hold, and
    prints the ``k_shape`` label that ``MetricsCalculation.set_k_shape``
    assigns to the first row of a 25-row copy of that candle.
    """
    # Build the sample candle.
    candle = pd.DataFrame({
        'open': [9.3030000000],
        'high': [9.3030000000],
        'low': [9.3020000000],
        'close': [9.3020000000]
    })
    print("测试数据:")
    print(candle)
    print()

    # Derive the basic features.
    candle['high_low_diff'] = candle['high'] - candle['low']
    candle['open_close_diff'] = (candle['close'] - candle['open']).abs()
    candle['open_close_fill'] = candle['open_close_diff'] / candle['high_low_diff']
    candle['price_range_ratio'] = candle['high_low_diff'] / candle['close'] * 100

    print("计算的特征:")
    for feature in ('high_low_diff', 'open_close_diff', 'open_close_fill'):
        print(f"{feature}: {candle[feature].iloc[0]}")
    print(f"price_range_ratio: {candle['price_range_ratio'].iloc[0]}%")
    print()

    # Check the "一字" conditions.
    price_range_ratio = candle['price_range_ratio'].iloc[0]
    open_close_fill = candle['open_close_fill'].iloc[0]
    narrow_range = price_range_ratio < 0.01
    full_body = open_close_fill > 0.9

    print("条件检查:")
    print(f"price_range_ratio < 0.01: {narrow_range}")
    print(f"open_close_fill > 0.9: {full_body}")
    print()

    # Use the MetricsCalculation class.
    mc = MetricsCalculation()

    # The sample row is repeated 25 times before it is handed to set_k_shape
    # (the original comment says this is to provide enough rows for the
    # rolling window).
    extended_data = pd.concat([candle] * 25, ignore_index=True)

    # Run set_k_shape on a copy.
    result = mc.set_k_shape(extended_data.copy())

    print("分类结果:")
    print(f"k_shape: {result['k_shape'].iloc[0]}")
    print()

    # Spell out which condition holds and which does not.
    print("详细分析:")
    print(f"价格范围比例: {price_range_ratio:.6f}%")
    print(f"实体占比: {open_close_fill:.6f}")
    print()

    print(
        "✓ 满足价格范围比例 < 0.01% 的条件"
        if narrow_range
        else f"✗ 不满足价格范围比例 < 0.01% 的条件 (实际: {price_range_ratio:.6f}%)"
    )
    print(
        "✓ 满足实体占比 > 0.9 的条件"
        if full_body
        else f"✗ 不满足实体占比 > 0.9 的条件 (实际: {open_close_fill:.6f})"
    )


if __name__ == "__main__":
    test_k_shape()