diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py index 4b0ec25..3ff701a 100644 --- a/core/auz_nz/hybrid_solution_script.py +++ b/core/auz_nz/hybrid_solution_script.py @@ -628,16 +628,16 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc cleaned_unmatched_db_list = remove_stopwords_nltk(cleaned_unmatched_db_list) cleaned_unmatched_db_list = remove_special_characters(cleaned_unmatched_db_list) prompt_context = f""" - {prompt_instruction} + {prompt_instruction} - provider_name: {provider_name} + provider_name: {provider_name} - prediction_fund: - {cleaned_unmatched_pred_list} - - true_fund: - {cleaned_unmatched_db_list} - """ + prediction_fund: + {cleaned_unmatched_pred_list} + + true_fund: + {cleaned_unmatched_db_list} + """ # print(f"\ncleaned_unmatched_pred_list: ",cleaned_unmatched_pred_list) # print(f"cleaned_unmatched_db_list: ",cleaned_unmatched_db_list) # llm_response = get_llm_response(prompt_context) @@ -660,35 +660,35 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc # cleaned_response = llm_response['response'].strip("```json").strip("```").replace('\n', '') # llm_result = json.loads(cleaned_response) # logger.info(f"\n\n llm_result: {llm_result}") - for k,v in llm_result.items(): + for pred_name,db_name in llm_result.items(): # print("k: ",k) # print("v: ",v) og_db_index=-1 # og_pred_index = -1 og_pred_index_list = [] - if k in cleaned_unmatched_pred_list: + if pred_name in cleaned_unmatched_pred_list: for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): - if c_item==k: + if c_item==pred_name: og_pred_index_list.append(c_idx) # og_pred_index = cleaned_unmatched_pred_list.index(k) if len(og_pred_index_list) == 0: # sometimes, the raw name and db name reversed from the LLM response - if v in cleaned_unmatched_pred_list and k in cleaned_unmatched_db_list: + if db_name in cleaned_unmatched_pred_list and pred_name in cleaned_unmatched_db_list: 
for c_idx, c_item in enumerate(cleaned_unmatched_pred_list): - if c_item==v: + if c_item==db_name: og_pred_index_list.append(c_idx) # og_pred_index = cleaned_unmatched_pred_list.index(v) - og_db_index = cleaned_unmatched_db_list.index(k) + og_db_index = cleaned_unmatched_db_list.index(pred_name) # v and k are swapped - temp = v - v = k - k = temp + temp = db_name + db_name = pred_name + pred_name = temp if len(og_pred_index_list)==0: continue # og_db_index = cleaned_unmatched_db_list.index(v) - if og_db_index == -1 and v in cleaned_unmatched_db_list: - og_db_index = cleaned_unmatched_db_list.index(v) + if og_db_index == -1 and db_name in cleaned_unmatched_db_list: + og_db_index = cleaned_unmatched_db_list.index(db_name) # print("og_db_index: ",og_db_index, cleaned_unmatched_db_list) # print("unmatched_db_list: ",unmatched_db_list) @@ -697,7 +697,7 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc if i['pred_fund']==unmatched_pred_list[og_pred_index]: if og_db_index!=-1: i['db_fund']=unmatched_db_list[og_db_index] - i['cleaned_db_fund_name'] = v + i['cleaned_db_fund_name'] = db_name final_result.update({unmatched_pred_list[og_pred_index]:unmatched_db_list[og_db_index]}) else: i['db_fund'] = '' @@ -705,8 +705,8 @@ def final_function_to_match(doc_id, pred_list, db_list, provider_name, doc_sourc final_result.update({unmatched_pred_list[og_pred_index]:""}) i['llm_clean_pred_list'] = cleaned_unmatched_pred_list i['llm_clean_db_list'] = cleaned_unmatched_db_list, - i['llm_pred_fund'] = k - i['llm_matched_db_name'] = v + i['llm_pred_fund'] = pred_name + i['llm_matched_db_name'] = db_name i['llm_result'] = llm_result break diff --git a/core/data_extraction.py b/core/data_extraction.py index 277c1f3..4445ecd 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -11,7 +11,7 @@ from utils.sql_query_util import query_document_fund_mapping, query_investment_b from utils.logger import logger from utils.biz_utils import 
add_slash_to_text_as_regex, clean_text, \ get_most_similar_name, remove_abundant_data, replace_special_table_header - +from utils.similarity import Similarity class DataExtraction: def __init__( @@ -511,7 +511,7 @@ class DataExtraction: raw_name_list = list(raw_name_dict.keys()) raw_name_as_production_name = None for raw_name in raw_name_list: - if raw_name.lower() in self.document_production.lower(): + if self.is_production_name(raw_name): raw_name_as_production_name = raw_name break datapoint_list_with_production_name = [] @@ -532,7 +532,7 @@ class DataExtraction: fund_name = data_item.get("fund_name", "") share_name = data_item.get("share_name", "") raw_name = self.get_raw_name(fund_name, share_name) - if raw_name.lower() in self.document_production.lower(): + if self.is_production_name(raw_name): dp_keys = [key for key in keys if key not in ["fund_name", "share_name", "management_fee_and_costs", @@ -584,6 +584,15 @@ class DataExtraction: extract_data["data"].remove(remove_item) return data_list, datapoint_list_with_production_name + def is_production_name(self, text: str): + """Return True when text is contained in self.document_production (case-insensitive) or its edit-distance similarity to it exceeds 0.93.""" + # Cheap case-insensitive containment test first. + if text.lower() in self.document_production.lower(): + return True + # Otherwise accept a close fuzzy match; both strings are passed to Similarity with their original casing. + score = Similarity().edit_distance_similarity(text, self.document_production) + return True if score > 0.93 else False + def remove_duplicate_data(self, data_list: list): """ The purpose is to remove duplicate data in the different pages. 
@@ -821,7 +830,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [4, 5]: + # if page_num not in [14, 15]: # continue if page_num in handled_page_num_list: continue diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 22f8105..e9ab8c9 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -234,7 +234,8 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}", "\n", - "F. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", + "F. If columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\" appear, please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\" for EACH SPECIFIC investment option. 
", + "DO NOT assume these values apply to other investment options mentioned elsewhere in the context or from provided examples.", "---Example 1 Start---", "\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n", "---Example 1 End---", @@ -337,7 +338,16 @@ "Pre-retirement pension \nWe generally calculate \nand deduct this fee daily when unit \nprices are determined. \nHigh Growth 0.48%, Growth 0.50%", "---Example 2 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}" + "{\"data\": [{\"fund name\": \"Pre-retirement pension High Growth\", \"share name\": \"Pre-retirement pension High Growth\", \"management_fee_and_costs\": 0.48, \"management_fee\": 0.48}, {\"fund name\": \"Pre-retirement pension Growth\", \"share name\": \"Pre-retirement pension Growth\", \"management_fee_and_costs\": 0.50, \"management_fee\": 0.50}]}", + "L. DO NOT extract management fees from \"Cost of product\" summaries. ", + "\"Cost of product\" figures should not be treated as 'Investment fees and costs'.", + "---Example Start---", + "Investment option Cost of product \nCash $141.00", + "---Example End---", + "FOUND \"Cost of product\", IGNORE ALL OF INFORMATION BELOW IT!!! JUST RETURN EMPTY RESPONSE!!!", + "The output should be:", + "{\"data\": []}", + "M. Do NOT infer or copy investment fees or management fees from examples provided for specific funds to other investment options. 
Only extract 'management_fee_and_costs' and 'management_fee' if explicitly stated separately for each investment option." ], "administration_fees":[ "Administration fees and costs and total annual dollar-based charges are share class level data.", @@ -406,7 +416,16 @@ "---Example 4 Start---", "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. \nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", "---Example 4 End---", - "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees." + "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees.", + "B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).", + "---Example Start---", + "My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of option’s assets* \nFund1 0.12%\n", + "---Example End---", + "According to example, \"My Super\" is with \"Administration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. 
of account balance (subject to a maximum of $1,000 p.a.) \n$0.50 per week deducted from your account balance at the end of each month or on exit.\"", + "so administration_fees is 0.17, total_annual_dollar_based_charges is 0.50 * 52 = 26, with production name: \"My Super\".", + "\"Fund1\" is with specific fund/ share name, so management_fee_and_costs and management_fee are: 0.12", + "The output should be:", + "{\"data\": [{\"fund name\": \"My Super\", \"share name\": \"My Super\", \"administration_fees\": 0.17, \"total_annual_dollar_based_charges\": 26}, {\"fund name\": \"Fund1\", \"share name\": \"Fund1\", \"management_fee_and_costs\": 0.12, \"management_fee\": 0.12}]}" ], "total_annual_dollar_based_charges": [ "Total annual dollar-based charges are share class level data.", @@ -845,19 +864,12 @@ }, "output_requirement": { "common": [ - "If possible, please extract fund name, share name, data points values as the output.", "If find fund name, and exist sub fund name, please output fund name + sub fund name, e.g. fund name is \"Black Rock European\", sub fund name is \"Growth\", the output fund name should be: \"Black Rock European Growth\".", "Only output the data point which with relevant value.", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", "Please also output the data point reported name in context.", - "Example:", - "---Example Start---", - "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. 
\n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n", - "---Example End---", - "Output:", - "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]}", "Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.", "The output should be JSON format, the format is like below example(s):" ], @@ -939,7 +951,8 @@ }, "end": [ "Only output JSON data.", - "Don't output the value which not exist in context.", + "Please re-check before output answer, DO NOT output the data point and value which not exist in context.", + "DO NOT use the example values from a representative fund (such as Balanced Growth) for other funds unless explicitly mentioned", "If can't find fund name or share class name in context, please output empty JSON data: {\"data\": []}" ] } \ No newline at end of file diff --git a/main.py b/main.py index 187ae91..1b5aa81 100644 --- a/main.py +++ b/main.py @@ -1452,7 +1452,7 @@ def get_aus_prospectus_document_category(): def test_post_adjust_extract_data(): - doc_id = "397107472" + doc_id = "462780211" pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1538,7 +1538,7 @@ if __name__ == 
"__main__": with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] document_mapping_file = r"/data/aus_prospectus/basic_information/next_round/next_round_6_documents_mapping.xlsx" - # special_doc_id_list = ["441280757"] + special_doc_id_list = ["462780211"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( diff --git a/performance.ipynb b/performance.ipynb index 9451e53..e90267b 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ @@ -57,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 32, "metadata": {}, "outputs": [], "source": [ @@ -79,7 +79,7 @@ "\n", " return headers, data\n", "\n", - "def index_data_by_key(data, key_index, secondary_key_index, header):\n", + "def index_data_by_key(data, header):\n", " \"\"\"Index data by primary and secondary keys (doc_id and sec_name).\"\"\"\n", " indexed_data = defaultdict(dict)\n", " \n", @@ -114,10 +114,12 @@ " value2 = convert_if_number(value2)\n", " return value1 == value2\n", "\n", - "def compare_data(ground_truth, generated_results, headers, doc_id_index, fund_name_index, intersection_list, funds_matched, funds_not_matched, document_list):\n", + "def compare_data(ground_truth, generated_results, headers, intersection_list, document_list):\n", " \"\"\"Compare data from two indexed sets, with the focus on matching generated results against ground truth.\"\"\"\n", " results = {}\n", - " funds_matched, funds_not_matched = 0, 0\n", + " share_name_list = []\n", + " not_matched_share_name_list = []\n", + " share_matched, share_not_matched = 0, 0\n", " # Initialize result dictionaries for each column except 'doc_id'\n", " for keys in headers:\n", " if keys != 
\"doc_id\":\n", @@ -180,13 +182,28 @@ " \"truth\": truth, \"generated\": generated, \"error\": \"Generated is null and truth is not null\"}\n", " message_list.append(message)\n", " results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", - " funds_matched += 1\n", + " if sec_name not in share_name_list:\n", + " share_name_list.append(sec_name)\n", + " share_matched += 1\n", " else:\n", - " funds_not_matched += 1\n", + " if sec_name not in share_name_list:\n", + " share_name_list.append(sec_name)\n", + " if sec_name not in not_matched_share_name_list:\n", + " # If the share class is not found in the generated results, count it as not matched\n", + " # print(\"Share class not matched - \", sec_name, doc_id)\n", + " message = {\"data_point\": \"Share Class\", \"doc_id\": doc_id, \"sec_name\": sec_name, \n", + " \"truth\": \"\", \"generated\": \"\", \"error\": \"Share class not found in generated results\"}\n", + " message_list.append(message)\n", + " share_not_matched += 1\n", + " not_matched_share_name_list.append(sec_name)\n", " else:\n", " # If the entire document is not found, count all funds as not matched\n", - " funds_not_matched += len(secs)\n", - " return results, message_list, funds_matched, funds_not_matched\n", + " message = {\"data_point\": \"Document\", \"doc_id\": doc_id, \"sec_name\": \"\",\n", + " \"truth\": \"\", \"generated\": \"\", \"error\": \"Document not found in generated results\"}\n", + " message_list.append(message)\n", + " \n", + " # share_not_matched += len(secs)\n", + " return results, message_list, share_matched, share_not_matched, not_matched_share_name_list\n", "\n", "def clean_text(text: str):\n", " if text is None or len(text) == 0:\n", @@ -330,7 +347,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -352,9 +369,10 @@ "buy_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", "sell_spread \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t18 \t18 \t34 \t0 \t0 \n", 
"TOTAL \t0.9841 \t0.9697 \t1.0000 \t0.9698 \t258 \t247 \t106 \t11 \t0 \n", - "Total Funds Matched - 52\n", - "Total Funds Not Matched - 28\n", - "Percentage of Funds Matched - 65.0\n" + "Total Shares Matched - 52\n", + "Total Shares Not Matched - 18\n", + "Percentage of Shares Matched - 74.28571428571429\n", + "Not Matched Shares Name List - ['Vision Balanced Growth Pen', 'CFS FC W Pen-Ausbil Aust Active Equity', 'CFS FC W Pen-AXA IM TTR W Sust Eq', 'CFS FC W PSup-FirstRate Term Dep (10yr)', 'CFS FC W PSup-FirstRate Term Dep (15yr)', 'CFS FC W PSup-FirstRate Term Dep (2yr)', 'CFS FC W PSup-FirstRate Term Dep (3yr)', 'CFS FC W PSup-FirstRate Term Dep (5yr)', 'CFS FC W PSup-FirstRate Term Dep (7yr)', 'AV Australian Shares TTR', 'AV Balanced Growth TTR', 'AV Cash TTR', 'AV Conservative Growth TTR', 'AV Diversified Index TTR', 'AV Growth TTR', 'AV High Growth TTR', 'AV International Shares TTR', 'AV Stable Growth TTR']\n" ] } ], @@ -370,20 +388,17 @@ "6. Set F1-Score to the first column in the metrics table\n", "\"\"\"\n", "\n", - "funds_matched = 0\n", - "funds_not_matched = 0\n", - "\n", "# Load the files\n", "headers_gt, ground_truth_data = load_excel(path_ground_truth, 0)\n", "headers_gen, generated_results_data = load_excel(path_generated_results, 0)\n", "\n", "# Assuming doc_id is the first column and fund_name is the second column\n", - "doc_id_index = 0\n", - "fund_name_index = 1\n", + "# doc_id_index = 0\n", + "# fund_name_index = 1\n", "\n", "# Index the data\n", - "ground_truth_indexed = index_data_by_key(ground_truth_data, doc_id_index, fund_name_index, headers_gt)\n", - "generated_results_indexed = index_data_by_key(generated_results_data, doc_id_index, fund_name_index, headers_gen)\n", + "ground_truth_indexed = index_data_by_key(ground_truth_data, headers_gt)\n", + "generated_results_indexed = index_data_by_key(generated_results_data, headers_gen)\n", "\n", "intersection = set(headers_gen).intersection(headers_gt)\n", "\n", @@ -425,20 +440,21 @@ " \n", " 
print(\"All Providers Results: \")\n", " print(\"Document List File - \", document_list_file)\n", - " comparison_results, message_list, funds_matched, funds_not_matched = compare_data(ground_truth_indexed, \n", - " generated_results_indexed, \n", - " headers_gt, doc_id_index, \n", - " fund_name_index, \n", - " intersection_list,\n", - " funds_matched, \n", - " funds_not_matched,\n", - " document_list)\n", + " comparison_results, message_list, share_matched, \\\n", + " share_not_matched, not_matched_share_name_list = compare_data(ground_truth_indexed, \n", + " generated_results_indexed,\n", + " headers_gt,\n", + " intersection_list,\n", + " document_list)\n", " metrics_list = print_metrics_table(comparison_results)\n", - " print(\"Total Funds Matched - \" + str(funds_matched) + \"\\nTotal Funds Not Matched - \" + str(funds_not_matched))\n", - " print(\"Percentage of Funds Matched - \" + str((funds_matched/(funds_matched + funds_not_matched))*100))\n", + " print(\"Total Shares Matched - \" + str(share_matched) + \"\\nTotal Shares Not Matched - \" + str(share_not_matched))\n", + " print(\"Percentage of Shares Matched - \" + str((share_matched/(share_matched + share_not_matched))*100))\n", + " print(\"Not Matched Shares Name List - \", not_matched_share_name_list)\n", "\n", " metrics_df = pd.DataFrame(metrics_list)\n", " message_df = pd.DataFrame(message_list)\n", + " share_matched_data = {\"share_matched\": share_matched, \"share_not_matched\": share_not_matched, \"not_matched_share_name_list\": not_matched_share_name_list}\n", + " share_matched_df = pd.DataFrame([share_matched_data])\n", "\n", " output_metrics_folder = r\"/data/aus_prospectus/output/metrics_data/\"\n", " os.makedirs(output_metrics_folder, exist_ok=True)\n", @@ -452,14 +468,23 @@ " metrics_file_path = os.path.join(output_metrics_folder, metrics_file_name)\n", " with pd.ExcelWriter(metrics_file_path) as writer:\n", " metrics_df.to_excel(writer, sheet_name=\"metrics_data\", index=False)\n", - " 
message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n" + " message_df.to_excel(writer, sheet_name=\"message_data\", index=False)\n", + " share_matched_df.to_excel(writer, sheet_name=\"share_matched_data\", index=False)\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'data_point': 'performance_fee_costs', 'doc_id': 539791362, 'sec_name': 'REST High Growth Pension', 'truth': '0.6', 'generated': '0.08', 'error': 'Truth is not equal with generated'}\n" + ] + } + ], "source": [ "for message_list_element in message_list:\n", " if message_list_element[\"data_point\"] == \"performance_fee_costs\":\n", diff --git a/sample_documents/aus_prospectus_verify_3_documents_sample.txt b/sample_documents/aus_prospectus_verify_3_documents_sample.txt new file mode 100644 index 0000000..8e69c2c --- /dev/null +++ b/sample_documents/aus_prospectus_verify_3_documents_sample.txt @@ -0,0 +1,3 @@ +539999907 +455235248 +448576924 \ No newline at end of file