diff --git a/.gitignore b/.gitignore index 80557e4..ed31de2 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ /performance.ipynb /sample_documents/special_cases.txt /aus-prospectus/ +/output/log/*.log diff --git a/mini_main.py b/mini_main.py new file mode 100644 index 0000000..a078573 --- /dev/null +++ b/mini_main.py @@ -0,0 +1,459 @@ +import os +import json +import numpy as np +import pandas as pd +from glob import glob +from tqdm import tqdm +import time +import fitz +import re +from io import BytesIO +from traceback import print_exc +from utils.logger import logger +from utils.pdf_download import download_pdf_from_documents_warehouse +from utils.sql_query_util import query_document_fund_mapping +from utils.pdf_util import PDFUtil +from utils.biz_utils import add_slash_to_text_as_regex +from core.page_filter import FilterPages +from core.data_extraction import DataExtraction +from core.data_mapping import DataMapping +from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call +from core.metrics import Metrics +import certifi + +class EMEA_AR_Parsing: + def __init__( + self, + doc_id: str, + doc_source: str = "emea_ar", + pdf_folder: str = r"/data/emea_ar/pdf/", + output_pdf_text_folder: str = r"/data/emea_ar/output/pdf_text/", + output_extract_data_folder: str = r"/data/emea_ar/output/extract_data/docs/", + output_mapping_data_folder: str = r"/data/emea_ar/output/mapping_data/docs/", + extract_way: str = "text", + drilldown_folder: str = r"/data/emea_ar/output/drilldown/", + compare_with_provider: bool = True + ) -> None: + self.doc_id = doc_id + self.doc_source = doc_source + self.pdf_folder = pdf_folder + os.makedirs(self.pdf_folder, exist_ok=True) + self.compare_with_provider = compare_with_provider + + self.pdf_file = self.download_pdf() + self.document_mapping_info_df = query_document_fund_mapping(doc_id, rerun=False) + + if extract_way is None or len(extract_way) == 0: + extract_way = "text" + self.extract_way = extract_way + 
self.output_extract_image_folder = None + if self.extract_way == "image": + self.output_extract_image_folder = ( + r"/data/emea_ar/output/extract_data/images/" + ) + os.makedirs(self.output_extract_image_folder, exist_ok=True) + + if output_extract_data_folder is None or len(output_extract_data_folder) == 0: + output_extract_data_folder = r"/data/emea_ar/output/extract_data/docs/" + if not output_extract_data_folder.endswith("/"): + output_extract_data_folder = f"{output_extract_data_folder}/" + if extract_way is not None and len(extract_way) > 0: + output_extract_data_folder = ( + f"{output_extract_data_folder}by_{extract_way}/" + ) + self.output_extract_data_folder = output_extract_data_folder + os.makedirs(self.output_extract_data_folder, exist_ok=True) + + if output_mapping_data_folder is None or len(output_mapping_data_folder) == 0: + output_mapping_data_folder = r"/data/emea_ar/output/mapping_data/docs/" + if not output_mapping_data_folder.endswith("/"): + output_mapping_data_folder = f"{output_mapping_data_folder}/" + if extract_way is not None and len(extract_way) > 0: + output_mapping_data_folder = ( + f"{output_mapping_data_folder}by_{extract_way}/" + ) + self.output_mapping_data_folder = output_mapping_data_folder + os.makedirs(self.output_mapping_data_folder, exist_ok=True) + + self.filter_pages = FilterPages( + self.doc_id, + self.pdf_file, + self.document_mapping_info_df, + self.doc_source, + output_pdf_text_folder, + ) + self.page_text_dict = self.filter_pages.page_text_dict + + self.datapoint_page_info, self.result_details = self.get_datapoint_page_info() + self.datapoints = self.get_datapoints_from_datapoint_page_info() + + if drilldown_folder is None or len(drilldown_folder) == 0: + drilldown_folder = r"/data/emea_ar/output/drilldown/" + os.makedirs(drilldown_folder, exist_ok=True) + self.drilldown_folder = drilldown_folder + misc_config_file = os.path.join( + f"./configuration/{doc_source}/", "misc_config.json" + ) + if 
os.path.exists(misc_config_file): + with open(misc_config_file, "r", encoding="utf-8") as f: + misc_config = json.load(f) + self.apply_drilldown = misc_config.get("apply_drilldown", False) + else: + self.apply_drilldown = False + + def download_pdf(self) -> str: + pdf_file = download_pdf_from_documents_warehouse(self.pdf_folder, self.doc_id) + return pdf_file + + def get_datapoint_page_info(self) -> tuple: + datapoint_page_info, result_details = self.filter_pages.start_job() + return datapoint_page_info, result_details + + def get_datapoints_from_datapoint_page_info(self) -> list: + datapoints = list(self.datapoint_page_info.keys()) + if "doc_id" in datapoints: + datapoints.remove("doc_id") + return datapoints + + def extract_data( + self, + re_run: bool = False, + ) -> list: + found_data = False + if not re_run: + output_data_json_folder = os.path.join( + self.output_extract_data_folder, "json/" + ) + os.makedirs(output_data_json_folder, exist_ok=True) + json_file = os.path.join(output_data_json_folder, f"{self.doc_id}.json") + if os.path.exists(json_file): + logger.info( + f"The document: {self.doc_id} has been parsed, loading data from {json_file}" + ) + with open(json_file, "r", encoding="utf-8") as f: + data_from_gpt = json.load(f) + found_data = True + + if not found_data: + try: + data_extraction = DataExtraction( + self.doc_source, + self.doc_id, + self.pdf_file, + self.output_extract_data_folder, + self.page_text_dict, + self.datapoint_page_info, + self.datapoints, + self.document_mapping_info_df, + extract_way=self.extract_way, + output_image_folder=self.output_extract_image_folder, + ) + data_from_gpt = data_extraction.extract_data() + except Exception as e: + logger.error(f"Error: {e}") + print_exc() + data_from_gpt = {"data": []} + + # Drilldown data to relevant PDF document + annotation_list = [] + if self.apply_drilldown: + try: + annotation_list = self.drilldown_pdf_document(data_from_gpt) + except Exception as e: + logger.error(f"Error: {e}") + 
return data_from_gpt, annotation_list + + def drilldown_pdf_document(self, data_from_gpt: list) -> list: + logger.info(f"Drilldown PDF document for doc_id: {self.doc_id}") + pdf_util = PDFUtil(self.pdf_file) + drilldown_data_list = [] + for data in data_from_gpt: + doc_id = str(data.get("doc_id", "")) + page_index = data.get("page_index", -1) + if page_index == -1: + continue + extract_data_list = data.get("extract_data", {}).get("data", []) + dp_reported_name_dict = data.get("extract_data", {}).get( + "dp_reported_name", {} + ) + highlighted_value_list = [] + for extract_data in extract_data_list: + for data_point, value in extract_data.items(): + if value in highlighted_value_list: + continue + if data_point in ["ter", "ogc", "performance_fee"]: + continue + drilldown_data = { + "doc_id": doc_id, + "page_index": page_index, + "data_point": data_point, + "parent_text_block": None, + "value": value, + "annotation_attribute": {}, + } + drilldown_data_list.append(drilldown_data) + highlighted_value_list.append(value) + + for data_point, reported_name in dp_reported_name_dict.items(): + if reported_name in highlighted_value_list: + continue + data_point = f"{data_point}_reported_name" + drilldown_data = { + "doc_id": doc_id, + "page_index": page_index, + "data_point": data_point, + "parent_text_block": None, + "value": reported_name, + "annotation_attribute": {}, + } + drilldown_data_list.append(drilldown_data) + highlighted_value_list.append(reported_name) + + drilldown_result = pdf_util.batch_drilldown( + drilldown_data_list=drilldown_data_list, + output_pdf_folder=self.drilldown_folder, + ) + annotation_list = [] + if len(drilldown_result) > 0: + logger.info(f"Drilldown PDF document for doc_id: {doc_id} successfully") + annotation_list = drilldown_result.get("annotation_list", []) + for annotation in annotation_list: + annotation["doc_id"] = doc_id + if self.drilldown_folder is not None and len(self.drilldown_folder) > 0: + drilldown_data_folder = 
os.path.join(self.drilldown_folder, "data/") + os.makedirs(drilldown_data_folder, exist_ok=True) + drilldown_file = os.path.join( + drilldown_data_folder, f"{doc_id}_drilldown.xlsx" + ) + + drilldown_source_df = pd.DataFrame(drilldown_data_list) + annotation_list_df = pd.DataFrame(annotation_list) + # set drilldown_result_df column order as doc_id, pdf_file, page_index, + # data_point, value, matching_val_area, normalized_bbox + try: + annotation_list_df = annotation_list_df[ + [ + "doc_id", + "pdf_file", + "page_index", + "data_point", + "value", + "matching_val_area", + "normalized_bbox", + ] + ] + except Exception as e: + logger.error(f"Error: {e}") + logger.info(f"Writing drilldown data to {drilldown_file}") + try: + with pd.ExcelWriter(drilldown_file) as writer: + drilldown_source_df.to_excel( + writer, index=False, sheet_name="source_data" + ) + annotation_list_df.to_excel( + writer, index=False, sheet_name="drilldown_data" + ) + except Exception as e: + logger.error(f"Error: {e}") + annotation_list = annotation_list_df.to_dict(orient="records") + try: + drilldown_json_file = os.path.join( + drilldown_data_folder, f"{doc_id}_drilldown.json" + ) + with open(drilldown_json_file, "w", encoding="utf-8") as f: + json.dump(annotation_list, f, ensure_ascii=False, indent=4) + except Exception as e: + logger.error(f"Error: {e}") + return annotation_list + + def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: + if not re_run: + output_data_json_folder = os.path.join( + self.output_mapping_data_folder, "json/" + ) + os.makedirs(output_data_json_folder, exist_ok=True) + json_file = os.path.join(output_data_json_folder, f"{self.doc_id}.json") + if os.path.exists(json_file): + logger.info( + f"The fund/ share of this document: {self.doc_id} has been mapped, loading data from {json_file}" + ) + with open(json_file, "r", encoding="utf-8") as f: + doc_mapping_data = json.load(f) + if self.doc_source == "aus_prospectus": + output_data_folder_splits = 
output_data_json_folder.split("output") + if len(output_data_folder_splits) == 2: + merged_data_folder = f'{output_data_folder_splits[0]}output/merged_data/docs/' + os.makedirs(merged_data_folder, exist_ok=True) + + merged_data_json_folder = os.path.join(merged_data_folder, "json/") + os.makedirs(merged_data_json_folder, exist_ok=True) + + merged_data_excel_folder = os.path.join(merged_data_folder, "excel/") + os.makedirs(merged_data_excel_folder, exist_ok=True) + + merged_data_file = os.path.join(merged_data_json_folder, f"merged_{self.doc_id}.json") + if os.path.exists(merged_data_file): + with open(merged_data_file, "r", encoding="utf-8") as f: + merged_data_list = json.load(f) + return merged_data_list + else: + data_mapping = DataMapping( + self.doc_id, + self.datapoints, + data_from_gpt, + self.document_mapping_info_df, + self.output_mapping_data_folder, + self.doc_source, + compare_with_provider=self.compare_with_provider + ) + merged_data_list = data_mapping.merge_output_data_aus_prospectus(doc_mapping_data, + merged_data_json_folder, + merged_data_excel_folder) + return merged_data_list + else: + return doc_mapping_data + """ + doc_id, + datapoints: list, + raw_document_data_list: list, + document_mapping_info_df: pd.DataFrame, + output_data_folder: str, + """ + data_mapping = DataMapping( + self.doc_id, + self.datapoints, + data_from_gpt, + self.document_mapping_info_df, + self.output_mapping_data_folder, + self.doc_source, + compare_with_provider=self.compare_with_provider + ) + return data_mapping.mapping_raw_data_entrance() + + +def filter_pages(doc_id: str, pdf_folder: str, doc_source: str) -> None: + logger.info(f"Filter EMEA AR PDF pages for doc_id: {doc_id}") + emea_ar_parsing = EMEA_AR_Parsing( + doc_id, doc_source=doc_source, pdf_folder=pdf_folder + ) + datapoint_page_info, result_details = emea_ar_parsing.get_datapoint_page_info() + return datapoint_page_info, result_details + + +def extract_data( + doc_id: str, + doc_source: str, + pdf_folder: 
str, + output_data_folder: str, + extract_way: str = "text", + re_run: bool = False, +) -> None: + logger.info(f"Extract EMEA AR data for doc_id: {doc_id}") + emea_ar_parsing = EMEA_AR_Parsing( + doc_id, + doc_source=doc_source, + pdf_folder=pdf_folder, + output_extract_data_folder=output_data_folder, + extract_way=extract_way, + ) + data_from_gpt, annotation_list = emea_ar_parsing.extract_data(re_run) + return data_from_gpt, annotation_list + + +def batch_extract_data( + pdf_folder: str, + doc_source: str = "emea_ar", + output_child_folder: str = r"/data/emea_ar/output/extract_data/docs/", + output_total_folder: str = r"/data/emea_ar/output/extract_data/total/", + extract_way: str = "text", + special_doc_id_list: list = None, + re_run: bool = False, +) -> None: + pdf_files = glob(pdf_folder + "*.pdf") + doc_list = [] + if special_doc_id_list is not None and len(special_doc_id_list) > 0: + doc_list = special_doc_id_list + + if len(doc_list) == 0: + logger.info(f"No special doc_id list provided, extracting all documents in {pdf_folder}") + return + + result_list = [] + for pdf_file in tqdm(pdf_files): + pdf_base_name = os.path.basename(pdf_file) + doc_id = pdf_base_name.split(".")[0] + if doc_list is not None and doc_id not in doc_list: + continue + data_from_gpt = extract_data( + doc_id=doc_id, + doc_source=doc_source, + pdf_folder=pdf_folder, + output_data_folder=output_child_folder, + extract_way=extract_way, + re_run=re_run, + ) + result_list.extend(data_from_gpt) + + if special_doc_id_list is None or len(special_doc_id_list) == 0: + result_df = pd.DataFrame(result_list) + result_df.reset_index(drop=True, inplace=True) + + logger.info(f"Saving the result to {output_total_folder}") + os.makedirs(output_total_folder, exist_ok=True) + time_stamp = time.strftime("%Y%m%d%H%M%S", time.localtime()) + output_file = os.path.join( + output_total_folder, + f"extract_data_info_{len(pdf_files)}_documents_{time_stamp}.xlsx", + ) + with pd.ExcelWriter(output_file) as writer: + 
result_df.to_excel(writer, index=False, sheet_name="extract_data_info") + + +def test_translate_pdf(): + from core.data_translate import Translate_PDF + + pdf_file = r"/data/emea_ar/pdf/451063582.pdf" + output_folder = r"/data/translate/output/" + translate_pdf = Translate_PDF(pdf_file, output_folder) + translate_pdf.start_job() + + +if __name__ == "__main__": + os.environ["SSL_CERT_FILE"] = certifi.where() + + doc_source = "aus_prospectus" + re_run = True + extract_way = "text" + if doc_source == "aus_prospectus": + special_doc_id_list = ["539266874"] + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_child_folder: str = ( + r"/data/aus_prospectus/output/extract_data/docs/" + ) + output_total_folder: str = ( + r"/data/aus_prospectus/output/extract_data/total/" + ) + elif doc_source == "emea_ar": + special_doc_id_list = ["514636993"] + pdf_folder: str = r"/data/emea_ar/pdf/" + output_child_folder: str = ( + r"/data/emea_ar/output/extract_data/docs/" + ) + output_total_folder: str = ( + r"/data/emea_ar/output/extract_data/total/" + ) + else: + raise ValueError(f"Invalid doc_source: {doc_source}") + + batch_extract_data( + pdf_folder=pdf_folder, + doc_source=doc_source, + output_child_folder=output_child_folder, + output_total_folder=output_total_folder, + extract_way=extract_way, + special_doc_id_list=special_doc_id_list, + re_run=re_run, + ) + + diff --git a/performance.ipynb b/performance.ipynb index 778901b..d3049d2 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 15, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", "# path_ground_truth = 
r\"/data/aus_prospectus/ground_truth/phase2_file/next_round/next_round_6_documents_ground_truth_with_mapping.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250402185144.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250403221414.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250328035602.xlsx\"\n", "# path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_6_documents_by_text_20250331180753.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -363,56 +363,56 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9421 \t0.8981 \t0.9906 \t0.8905 \t475 \t423 \t0 \t48 \t4 \n", - "management_fee \t0.9583 \t0.9278 \t0.9909 \t0.9200 \t475 \t437 \t0 \t34 \t4 \n", - "performance_fee_costs \t0.9060 \t0.8852 \t0.9278 \t0.8821 \t317 \t270 \t149 \t35 \t21 \n", - "interposed_vehicle_performance_fee_cost \t0.9114 \t0.8372 \t1.0000 \t0.9705 \t73 \t72 \t389 \t14 \t0 \n", - "administration_fees \t0.9936 \t0.9873 \t1.0000 \t0.9979 \t78 \t78 \t396 \t1 \t0 \n", - "total_annual_dollar_based_charges \t0.9412 \t0.8889 \t1.0000 \t0.9811 \t72 \t72 \t394 \t9 \t0 \n", - "buy_spread \t0.9391 \t0.9156 \t0.9639 \t0.9053 \t387 \t347 \t83 \t32 \t13 \n", - "sell_spread \t0.9377 \t0.9129 \t0.9638 \t0.9032 \t387 \t346 \t83 \t33 \t13 \n", - 
"minimum_initial_investment \t0.9735 \t0.9821 \t0.9649 \t0.9621 \t342 \t330 \t127 \t6 \t12 \n", - "benchmark_name \t0.9268 \t0.8941 \t0.9620 \t0.9495 \t167 \t152 \t299 \t18 \t6 \n", - "TOTAL \t0.9430 \t0.9129 \t0.9764 \t0.9362 \t2773 \t2527 \t1920 \t230 \t73 \n", - "Total Shares Matched - 427\n", - "Total Shares Not Matched - 90\n", - "Percentage of Shares Matched - 82.59187620889749\n", - "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property 
Securities Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum International Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - Platinum Global Fund', 'MLC MasterKey Pension Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Pension 
Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKPFPR - Platinum Asia Fund', 'MLC MKSF - Platinum Asia Fund', 'MLC MKPF - Platinum International Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKPF - PM CAPITAL Global Companies', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKPF - Schroder WS Australian Equity', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'Australian Unity Inv Wholesale Deposits Fund', 'Lifeplan Investment Bond Lifeplan Capital Guaranteed', 'Dimensional Australian Core Equity Trust', 'CFS FC ESup-CFS Diversified Fix Int', 'FC W Pen-CFS TTR Conservative', 'FC W Pen-CFS TTR Diversified', 'FC W Pen-CFS TTR High Growth', 'FC W Pen-CFS TTR Moderate', 'FC W Pen-CFS TTR Growth', 'FC W Pen-CFS TTR Defensive', 'CFS MIF-Geared Share NEF', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "management_fee_and_costs \t0.9338 \t0.8899 \t0.9823 \t0.8758 \t443 \t388 \t0 \t48 \t7 \n", + "management_fee \t0.9576 \t0.9335 \t0.9831 \t0.9187 \t443 \t407 \t0 \t29 \t7 \n", + "performance_fee_costs \t0.9157 \t0.8964 \t0.9358 \t0.8849 \t316 \t277 \t115 \t32 \t19 \n", + "interposed_vehicle_performance_fee_cost \t0.9182 \t0.8488 \t1.0000 \t0.9707 \t73 \t73 \t357 \t13 \t0 \n", + "administration_fees \t0.9930 \t0.9861 \t1.0000 \t0.9977 \t71 \t71 \t371 \t1 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t71 \t71 \t372 \t0 \t0 \n", + "buy_spread \t0.9436 \t0.9138 \t0.9755 \t0.9142 \t352 \t318 \t87 \t30 \t8 \n", + "sell_spread \t0.9421 \t0.9109 \t0.9754 \t0.9120 \t352 \t317 \t87 \t31 \t8 \n", + "minimum_initial_investment \t0.9755 \t0.9803 \t0.9708 \t0.9661 
\t308 \t299 \t129 \t6 \t9 \n", + "benchmark_name \t0.9053 \t0.8600 \t0.9556 \t0.9391 \t147 \t129 \t287 \t21 \t6 \n", + "TOTAL \t0.9485 \t0.9220 \t0.9778 \t0.9379 \t2576 \t2350 \t1805 \t211 \t64 \n", + "Total Shares Matched - 395\n", + "Total Shares Not Matched - 121\n", + "Percentage of Shares Matched - 76.55038759689923\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Balanced Fund – Retail Units', 'Mercer Multi-manager Conservative Fund – Retail Units', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual 
Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum International Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Hedged Global Share Fund', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - 
Platinum Global Fund', 'MLC MasterKey Pension Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKPF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKSF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKPF - PIMCO Global Bond Wholesale Class', 'MLC MKPFPR - Platinum Asia Fund', 'MLC MKSF - Platinum Asia Fund', 'MLC MKPF - Platinum International Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKPF - PM CAPITAL Global Companies', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKPF - Schroder WS Australian Equity', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'Australian Unity Inv Wholesale Deposits Fund', 'Lifeplan Investment Bond Lifeplan Capital Guaranteed', 'Lifeplan Investment Bond Perpetual Balanced Growth', 'Lifeplan Investment Bond Perpetual Conservative Growth', 'Lifeplan Investment Bond Perpetual Industrial Share', 'Lifeplan Investment Bond Vanguard® Australian Shares Index', 'Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Conservative', 'FC W Pen-CFS TTR Diversified', 'FC W Pen-CFS TTR High Growth', 'FC W Pen-CFS TTR Property Securities', 'FC W Pen-CFS TTR Moderate', 'FC W Pen-CFS TTR Balanced', 'FC W Pen-CFS TTR Growth', 'FC W Pen-CFS TTR Australian Small Companies', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'FC W Pen-CFS TTR Fixed Interest', 'FC W Pen-CFS TTR Emerging Markets', 'FC W Pen-CFS TTR Defensive', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS 
MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9580 \t0.9194 \t1.0000 \t0.9194 \t186 \t171 \t0 \t15 \t0 \n", - "management_fee \t0.9891 \t0.9785 \t1.0000 \t0.9785 \t186 \t182 \t0 \t4 \t0 \n", - "performance_fee_costs \t0.8743 \t0.8421 \t0.9091 \t0.8763 \t96 \t80 \t83 \t15 \t8 \n", - "interposed_vehicle_performance_fee_cost \t0.8814 \t0.7879 \t1.0000 \t0.9247 \t53 \t52 \t120 \t14 \t0 \n", - "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t9 \t9 \t177 \t0 \t0 \n", - "buy_spread \t0.9891 \t0.9837 \t0.9945 \t0.9785 \t184 \t181 \t1 \t3 \t1 \n", - "sell_spread \t0.9835 \t0.9728 \t0.9944 \t0.9677 \t184 \t179 \t1 \t5 \t1 \n", - "minimum_initial_investment \t0.9384 \t0.9580 \t0.9195 \t0.9032 \t149 \t137 \t31 \t6 \t12 \n", - "benchmark_name \t0.9278 \t0.8738 \t0.9890 \t0.9247 \t100 \t90 \t82 \t13 \t1 \n", - "TOTAL \t0.9491 \t0.9240 \t0.9785 \t0.9415 \t1147 \t1081 \t495 \t75 \t96 \n", - "Total Shares Matched - 181\n", - "Total Shares Not Matched - 10\n", - "Percentage of Shares Matched - 94.76439790575915\n", - "Not Matched Shares Name List - ['Dimensional Australian Core Equity Trust', 'CFS FC ESup-CFS Diversified Fix Int', 'FC W Pen-CFS TTR Conservative', 'FC W Pen-CFS TTR Diversified', 'FC W Pen-CFS TTR High Growth', 'FC W Pen-CFS TTR Moderate', 'FC W Pen-CFS TTR Growth', 'FC W Pen-CFS TTR Defensive', 'CFS MIF-Geared Share NEF', 'Dimensional Australia Core Equity Trust - Active ETF']\n", + "management_fee_and_costs \t0.9721 \t0.9458 \t1.0000 \t0.9458 \t166 \t157 \t0 \t9 \t0 \n", + "management_fee 
\t0.9909 \t0.9819 \t1.0000 \t0.9819 \t166 \t163 \t0 \t3 \t0 \n", + "performance_fee_costs \t0.8877 \t0.8737 \t0.9022 \t0.8735 \t95 \t83 \t62 \t12 \t9 \n", + "interposed_vehicle_performance_fee_cost \t0.8908 \t0.8030 \t1.0000 \t0.9217 \t53 \t53 \t100 \t13 \t0 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t3 \t3 \t163 \t0 \t0 \n", + "buy_spread \t0.9782 \t0.9573 \t1.0000 \t0.9578 \t164 \t157 \t2 \t7 \t0 \n", + "sell_spread \t0.9718 \t0.9451 \t1.0000 \t0.9458 \t164 \t155 \t2 \t9 \t0 \n", + "minimum_initial_investment \t0.9412 \t0.9524 \t0.9302 \t0.9096 \t129 \t120 \t31 \t6 \t9 \n", + "benchmark_name \t0.9017 \t0.8298 \t0.9873 \t0.8976 \t89 \t78 \t71 \t16 \t1 \n", + "TOTAL \t0.9483 \t0.9210 \t0.9800 \t0.9371 \t1029 \t969 \t431 \t75 \t83 \n", + "Total Shares Matched - 161\n", + "Total Shares Not Matched - 29\n", + "Percentage of Shares Matched - 84.73684210526315\n", + "Not Matched Shares Name List - ['Dimensional Australian Core Equity Trust', 'FC W Pen-CFS TTR Conservative', 'FC W Pen-CFS TTR Diversified', 'FC W Pen-CFS TTR High Growth', 'FC W Pen-CFS TTR Property Securities', 'FC W Pen-CFS TTR Moderate', 'FC W Pen-CFS TTR Balanced', 'FC W Pen-CFS TTR Growth', 'FC W Pen-CFS TTR Australian Small Companies', 'FC W Pen-CFS TTR Global Infrastructure Securities', 'FC W Pen-CFS TTR Fixed Interest', 'FC W Pen-CFS TTR Emerging Markets', 'FC W Pen-CFS TTR Defensive', 'CFS MIF-High Growth', 'CFS MIF-Property Securities', 'CFS MIF-Geared Share NEF', 'CFS MIF-Australian Share', 'CFS MIF-Geared Global Share', 'CFS MIF-Global Tech & Comm', 'CFS MIF-Stewart Inv Worldwide Leaders Sustainability', 'CFS MIF-Geared Share', 'CFS MIF-Diversified', 'CFS MIF-Janus Henderson Global Natural Resources Fund', 'CFS MIF-Macquarie Australian Emerging Companies', 'CFS MIF-Balanced', 'CFS MIF-Conservative', 'CFS MIF-Imputation', 'CFS MIF-Global Health & Biotech', 'Dimensional Australia Core Equity Trust - Active ETF']\n", "All Providers Results: \n", "Document List File - 
./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9316 \t0.8842 \t0.9844 \t0.8720 \t289 \t252 \t0 \t33 \t4 \n", - "management_fee \t0.9375 \t0.8947 \t0.9846 \t0.8824 \t289 \t255 \t0 \t30 \t4 \n", - "performance_fee_costs \t0.9201 \t0.9048 \t0.9360 \t0.8858 \t221 \t190 \t66 \t20 \t13 \n", - "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t269 \t0 \t0 \n", - "administration_fees \t0.9928 \t0.9857 \t1.0000 \t0.9965 \t69 \t69 \t219 \t1 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t217 \t0 \t0 \n", - "buy_spread \t0.8901 \t0.8513 \t0.9326 \t0.8581 \t203 \t166 \t82 \t29 \t12 \n", - "sell_spread \t0.8930 \t0.8564 \t0.9330 \t0.8616 \t203 \t167 \t82 \t28 \t12 \n", - "minimum_initial_investment \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t193 \t193 \t96 \t0 \t0 \n", - "benchmark_name \t0.9254 \t0.9254 \t0.9254 \t0.9654 \t67 \t62 \t217 \t5 \t5 \n", - "TOTAL \t0.9491 \t0.9302 \t0.9696 \t0.9322 \t1626 \t1446 \t1248 \t146 \t146 \n", - "Total Shares Matched - 289\n", - "Total Shares Not Matched - 80\n", - "Percentage of Shares Matched - 78.31978319783198\n", - "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA Inv-OnePath Multi Asset Income EF', 'ANZ OA Inv-OnePath Multi Asset Income NEF', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. 
Rowe Price Dyna Gl Bond NE', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMix Global Emerging Markets Share-NEF', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum International Trust-NEF', 'OnePath OA IP-Schroder Fixed Income-EF/Sel', 'OnePath OA IP-Schroder Fixed Income-NEF', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - 
OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - Platinum Global Fund', 'MLC MasterKey Pension Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - Perpetual Smll Co Fund No.2', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKSF - PIMCO Div. 
Fixed Interest Wholesale Class', 'MLC MKPFPR - Platinum Asia Fund', 'MLC MKSF - Platinum Asia Fund', 'MLC MKPF - Platinum International Fund', 'MLC MKSF - Platinum International Fund', 'MLC MKPF - PM CAPITAL Global Companies', 'MLC MKSF - PM CAPITAL Global Companies', 'MLC MKPF - Schroder WS Australian Equity', 'MLC MKSF - Schroder WS Australian Equity', 'MLC MasterKey Pension Fundamentals (Pre Retirement) - MLC Aust Property Index', 'MLC MasterKey Super Fundamentals - MLC Australian Property Index', 'MLC MKSF - Vanguard Intl Shr Indx (Hgd)', 'MLC MKSF - Vanguard Intl Shr Indx', 'Australian Unity Inv Wholesale Deposits Fund', 'Lifeplan Investment Bond Lifeplan Capital Guaranteed']\n" + "management_fee_and_costs \t0.9094 \t0.8556 \t0.9706 \t0.8339 \t277 \t231 \t0 \t39 \t7 \n", + "management_fee \t0.9367 \t0.9037 \t0.9721 \t0.8809 \t277 \t244 \t0 \t26 \t7 \n", + "performance_fee_costs \t0.9282 \t0.9065 \t0.9510 \t0.8917 \t221 \t194 \t53 \t20 \t10 \n", + "interposed_vehicle_performance_fee_cost \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t20 \t20 \t257 \t0 \t0 \n", + "administration_fees \t0.9927 \t0.9855 \t1.0000 \t0.9964 \t68 \t68 \t208 \t1 \t0 \n", + "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t71 \t71 \t206 \t0 \t0 \n", + "buy_spread \t0.9122 \t0.8750 \t0.9527 \t0.8881 \t188 \t161 \t85 \t23 \t8 \n", + "sell_spread \t0.9153 \t0.8804 \t0.9529 \t0.8917 \t188 \t162 \t85 \t22 \t8 \n", + "minimum_initial_investment \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t179 \t179 \t98 \t0 \t0 \n", + "benchmark_name \t0.9107 \t0.9107 \t0.9107 \t0.9639 \t58 \t51 \t216 \t5 \t5 \n", + "TOTAL \t0.9505 \t0.9317 \t0.9710 \t0.9347 \t1547 \t1381 \t1208 \t136 \t128 \n", + "Total Shares Matched - 277\n", + "Total Shares Not Matched - 92\n", + "Percentage of Shares Matched - 75.06775067750678\n", + "Not Matched Shares Name List - ['SPDR® S&P World ex Australia Carbon Control Fund', 'Mercer Multi-manager Balanced Fund – Retail Units', 'Mercer Multi-manager Conservative Fund – 
Retail Units', 'Mercer Multi-manager Growth Fund – Retail Units', 'Mercer Multi-manager High Growth Fund – Retail Units', 'ANZ OA IP-OP Diversified Credit EF', 'ANZ OA IP-OP Diversified Credit NE', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond EF', 'OnePath ANZ OA IP-T. Rowe Price Dyna Gl Bond NE', 'OnePath OA IP- Pendal Monthly Income Plus-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-EF/Sel', 'OnePath OA IP-ANZ Cash Advantage-NEF', 'OnePath OA IP-Bentham Global Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-EF/Sel', 'OnePath OA IP-Kapstream Absolute Return Income Trust-NEF', 'OnePath OA IP-OnePath Active Growth Trust-NEF', 'OnePath OA IP-OnePath High Growth Trust-EF/Sel', 'OnePath OA IP-OnePath High Growth Trust-NEF', 'OnePath OA IP-OnePath Managed Growth Trust-EF/Sel', 'OnePath OA IP-OnePath Managed Growth Trust-NEF', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Fixed Interest Trust-NEF', 'OnePath OA IP-OptiMix Australian Share Trust-EF/Sel', 'OnePath OA IP-OptiMix Australian Share Trust-NEF', 'OnePath OA IP-OptiMix Global Emerging Markets Share-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-EF/Sel', 'OnePath OA IP-OptiMIx Global Share Trust-NEF', 'OnePath OA IP-OptiMix High Growth Trust-EF/Sel', 'OnePath OA IP-OptiMix High Growth Trust-NEF', 'OnePath OA IP-OptiMix Property Securities Trust-EF/Sel', 'OnePath OA IP-OptiMix Property Securities Trust-NEF', 'OnePath OA IP-Perpetual Conservative Growth Trust-EF/Sel', 'OnePath OA IP-Perpetual Conservative Growth Trust-NEF', 'OnePath OA IP-Platinum International Trust-EF/Sel', 'OnePath OA IP-Platinum International Trust-NEF', 'OnePath OA IP-UBS Defensive Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-EF/Sel', 'OnePath OA IP-UBS Diversified Fixed Income Trust-NEF', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Ardea Real Outcome -NE', 'OnePath OneAnswer 
Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -EF/Sel', 'OnePath OneAnswer Investment Portfolio - Barrow Hanley Concentrated Global Shares Hedged -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Balanced Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Conservative Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath Diversified Bond Index -NE', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -EF/Sel', 'OnePath OneAnswer Investment Portfolio - OnePath International Shares Index (Hedged) -NE', 'OnePath Schroder Real Return Trust (Entry Fee)', 'OnePath Schroder Real Return Trust (Nil Entry Fee)', 'Telstra Growth Pen', 'First Sentier Concentrated Aus Share', 'First Sentier Australian Small Companies', 'First Sentier Imputation', 'First Sentier Global Property Securities', 'First Sentier Australian Share', 'CFS FC-Investors Mutual Future Leaders', 'Stewart Worldwide Leaders Sustainability', 'First Sentier Property Securities', 'MyNorth Index Defensive', 'MLC MKPFPR - Altrinsic Global Eq Trust', 'MLC MKPFPR - BlackRock Global Allocation', 'MLC MKPF - Hedged Global Share Fund', 'MLC MKPF - Inflation Plus - Conservative', 'MLC MKPFPR - MLC - Platinum Global Fund', 'MLC MasterKey Pension Fundamentals - Perpetual Australian Share', 'MLC MasterKey Super Fundamentals - Perpetual Australian Share', 'MLC MKPF - Perpetual WS Ethical SRI Fund', 'MLC MKSF - Perpetual WS Ethical SRI Fund', 'MLC MasterKey Super Fundamentals - Perpetual Small Co Fund No.2', 'MLC MKPF - PIMCO Div. Fixed Interest Wholesale Class', 'MLC MKSF - PIMCO Div. 
import pandas as pd
import numpy as np
import sys
import os

# Add the project path.
sys.path.append('crypto_quant')

from crypto_quant.core.biz.metrics_calculation import MetricsCalculation


def test_k_shape():
    """Print a step-by-step diagnosis of how one narrow candle is classified.

    A single hard-coded OHLC candle is enriched with four derived columns
    (``high_low_diff``, ``open_close_diff``, ``open_close_fill`` and
    ``price_range_ratio``), the two "一字" threshold checks are printed, and
    the ``k_shape`` value produced by ``MetricsCalculation.set_k_shape`` for
    the first row of a 25-row copy of that candle is reported.

    The function only prints; it asserts nothing and returns ``None``.
    """
    # Build the test data: one candle whose open equals its high and whose
    # close equals its low.
    candle = pd.DataFrame(
        {
            'open': [9.3030000000],
            'high': [9.3030000000],
            'low': [9.3020000000],
            'close': [9.3020000000],
        }
    )

    print("测试数据:")
    print(candle)
    print()

    # Compute the basic features. The columns are added in this exact order
    # because the enriched frame is later handed to set_k_shape as-is.
    price_range = candle['high'] - candle['low']
    body_size = (candle['close'] - candle['open']).abs()
    candle['high_low_diff'] = price_range
    candle['open_close_diff'] = body_size
    candle['open_close_fill'] = body_size / price_range
    candle['price_range_ratio'] = price_range / candle['close'] * 100

    # Scalar views of the single row, reused by every report below.
    high_low_diff = candle['high_low_diff'].iloc[0]
    open_close_diff = candle['open_close_diff'].iloc[0]
    open_close_fill = candle['open_close_fill'].iloc[0]
    price_range_ratio = candle['price_range_ratio'].iloc[0]

    print("计算的特征:")
    print(f"high_low_diff: {high_low_diff}")
    print(f"open_close_diff: {open_close_diff}")
    print(f"open_close_fill: {open_close_fill}")
    print(f"price_range_ratio: {price_range_ratio}%")
    print()

    # Check the "一字" conditions.
    # NOTE(review): these thresholds presumably mirror the ones used inside
    # set_k_shape -- confirm against that method.
    range_is_narrow = price_range_ratio < 0.01
    body_fills_range = open_close_fill > 0.9

    print("条件检查:")
    print(f"price_range_ratio < 0.01: {range_is_narrow}")
    print(f"open_close_fill > 0.9: {body_fills_range}")
    print()

    # Use the MetricsCalculation class.
    calculator = MetricsCalculation()

    # For the test a DataFrame with enough rows is needed, so the candle is
    # repeated to give the rolling window something to work with.
    repeated_candles = pd.concat([candle] * 25, ignore_index=True)

    # Run set_k_shape on a copy so the repeated frame itself is not modified.
    classified = calculator.set_k_shape(repeated_candles.copy())

    # NOTE(review): row 0 is reported; if set_k_shape relies on rolling
    # statistics a later row may be better populated -- confirm.
    print("分类结果:")
    print(f"k_shape: {classified['k_shape'].iloc[0]}")
    print()

    # Detailed analysis of why the candle was not classified as "一字".
    print("详细分析:")
    print(f"价格范围比例: {price_range_ratio:.6f}%")
    print(f"实体占比: {open_close_fill:.6f}")
    print()

    print(
        "✓ 满足价格范围比例 < 0.01% 的条件"
        if range_is_narrow
        else f"✗ 不满足价格范围比例 < 0.01% 的条件 (实际: {price_range_ratio:.6f}%)"
    )
    print(
        "✓ 满足实体占比 > 0.9 的条件"
        if body_fills_range
        else f"✗ 不满足实体占比 > 0.9 的条件 (实际: {open_close_fill:.6f})"
    )


if __name__ == "__main__":
    test_k_shape()