diff --git a/core/data_extraction.py b/core/data_extraction.py index b31f32b..f7d4609 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -108,6 +108,29 @@ class DataExtraction: pass return document_category, document_production + + def get_objective_fund_name(self, page_text: str) -> str: + fund_name = "" + if self.doc_source == "aus_prospectus": + objective_fund_name_prompts_file = os.path.join(self.instruction_folder, "objective_fund_name_prompts.json") + if not os.path.exists(objective_fund_name_prompts_file): + return fund_name + with open(objective_fund_name_prompts_file, "r", encoding="utf-8") as f: + objective_fund_name_prompt = "\n".join(json.load(f).get("prompts", [])) + if len(objective_fund_name_prompt) > 0: + prompts = f"Context: \n{page_text}\n\Instructions: \n{objective_fund_name_prompt}" + result, with_error = chat( + prompt=prompts, response_format={"type": "json_object"}, max_tokens=1000 + ) + response = result.get("response", "") + if not with_error: + try: + data = json.loads(response) + fund_name = data.get("fund_name", "") + except: + pass + return fund_name + def get_datapoint_page_info(self, datapoint_page_info: dict) -> dict: """ @@ -647,7 +670,7 @@ class DataExtraction: (mf_share_name.endswith(share_name) or share_name.endswith(mf_share_name))): if exist_complex_rule_keywords and \ ("interposed_vehicle_performance_fee_cost" in keys or "recoverable_expenses" in keys): - mfc["management_fee"] = management_fee + mf["management_fee"] = management_fee found = True break else: @@ -766,7 +789,7 @@ class DataExtraction: previous_page_datapoints = [] previous_page_fund_name = None for page_num, page_text in self.page_text_dict.items(): - # if page_num not in [74]: + # if page_num not in [25]: # continue if page_num in handled_page_num_list: continue @@ -1092,12 +1115,16 @@ class DataExtraction: diff_pages = [page_num - investment_objective_page for investment_objective_page in self.investment_objective_pages if 
investment_objective_page <= page_num] - if len(diff_pages) > 0 and diff_pages[-1] < 5: + if len(diff_pages) > 0 and diff_pages[-1] < 5 and diff_pages[-1] > 0: top_nearest_investment_objective_page = self.investment_objective_pages[len(diff_pages) - 1] top_nearest_investment_objective_text = self.page_text_dict.get(top_nearest_investment_objective_page, "") - if top_nearest_investment_objective_text in page_text: + + if top_nearest_investment_objective_text in page_text and \ + top_nearest_investment_objective_text != page_text: page_text = page_text.replace(top_nearest_investment_objective_text, "").strip() - pre_context = f"\nThe most recent investment objective page text which maybe with fund name is: \n{top_nearest_investment_objective_text}.\n" + pre_context_fund_name = self.get_objective_fund_name(top_nearest_investment_objective_text) + if pre_context_fund_name is not None and len(pre_context_fund_name) > 0: + pre_context = f"\nThe fund name for most recent investment objective page text is: \n{pre_context_fund_name}.\n" # If can't find previous investment objective text, add the fund names to be the pre-fix of page text page_text = f"{pre_context}\n{page_text}".strip() diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index 1cfa46a..5bd7782 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -375,7 +375,7 @@ "buy_spread": [ "A. 
Exclude reported name", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", - "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), ", + "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), Cost of product, ", "Estimated transaction costs offset by buy/sell spreads (% pa), ", "---Example 1 Start---", "Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n", @@ -388,6 +388,12 @@ "---Example 2 End---", "The data is about Transaction costs, should be excluded, the output for buy_spread and sell_spread should be:", "{\"data\": []}", + "\n", + "---Example 3 Start---", + "Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n", + "---Example 3 End---", + "The data is about Cost of product, should be excluded, the output for buy_spread and sell_spread should be:", + "{\"data\": []}", "B. Simple case with simple table structure:", "---Example 1 Start---", "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n", @@ -416,12 +422,18 @@ "performance_fee_costs": [ "Performance fees is share class level data.", "A. If the performance fees is with the range, please ignore and output empty.", - "---Example 1 Start---", + "---Example Start---", "Performance fees \nAmounts deducted from your \ninvestment in relation to the \nperformance of the product \nEstimated to be 0.00% p.a. to 2.18% p.a. of the net \nasset value of the relevant investment option based \non a 5 year average. 
\nThe estimated performance fee based on an average \nof the previous 5 financial years for each investment \noption are shown on the table in the Performance \nfee section below.", - "---Example 1 End---", + "---Example End---", "The relevant values: 0.00 and 2.18, are in the range, so the output should be:", "{\"data\": []}", - "B. If with pure performance fee in table, please extract relevant values", + "B. If the table is only about Cost of product, should be excluded, ", + "---Example Start---", + "Fund name \nCost of product \nCFS Index Australian Bond Fund \n$155 \n", + "---Example End---", + "The data is about Cost of product, should be excluded, the output for Performance fees should be:", + "{\"data\": []}", + "C. If with pure performance fee in table, please extract relevant values", "---Example Start---", "\n\nFees and costs summary \nPlatinum Trust Funds \nType of fee or cost Amount How and when paid \nC Class and E Class* -\nStandard Fee Option \nP Class - Performance \nFee Option \nOngoing annual fees and costs \nPerformance fees \nAmounts deducted from your investment in \nrelation to the performance of the product. \nPlatinum International Fund Nil 0.15%\nPlatinum Global Fund (Long Only) Nil 0.24%\n", "---Example End---", @@ -429,8 +441,7 @@ "b. 
This example mentioned share classes, please output according to share class.", "The output should be", "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", - "C. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0" - + "D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0" ], "minimum_initial_investment": [ "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", diff --git a/instructions/aus_prospectus/objective_fund_name_prompts.json b/instructions/aus_prospectus/objective_fund_name_prompts.json new file mode 100644 index 0000000..3321c35 --- /dev/null +++ b/instructions/aus_prospectus/objective_fund_name_prompts.json @@ -0,0 +1,15 @@ +{ + "prompts": [ + "Get the fund name from document context. \n", + "The document context contains fund investment objective(s).\n", + "1. Please locate the last investment objective in the document context.\n", + "2. Please provide the relevant fund name for the last investment objective.\n", + "3. 
Usually, the fund name can be found in the several lines above the last investment objective.\n", + "----Example context start----", + "\n\nMLC Horizon 4 Balanced Portfolio \n\nThis option invests in a wide range of asset classes with a strong bias towards shares and other growth assets. It ’ s designed for members who \nare focused on higher returns and are willing to take on exposure to more volatile investments. \n\nMLC Horizon 4 Balanced Portfolio \nInvestment objective \nAims to grow by more than inflation +3% pa (after fees and tax) over 10 years. \nBenchmark \nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \nThe investment option may be \nsuited to you if... \nyou want your investment to exceed changes in the costs of living, over the long term \nyou want a higher emphasis on growth than stability \nyou understand returns may be higher or lower than its objective, and \nyou value active management. \n\n3 \n\nMLC MasterKey Super & Pension Fundamentals Product Disclosure Statement", + "----Example context end----", + "The output should be in JSON format:", + "{\"fund_name\": \"MLC Horizon 4 Balanced Portfolio\"}\n", + "Answer:\n" + ] +} \ No newline at end of file diff --git a/main.py b/main.py index f7d3310..4d308c5 100644 --- a/main.py +++ b/main.py @@ -1452,7 +1452,7 @@ def get_aus_prospectus_document_category(): def test_post_adjust_extract_data(): - doc_id = "539266814" + doc_id = "397107472" pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_extract_data_child_folder: str = ( @@ -1526,8 +1526,8 @@ if __name__ == "__main__": # special_doc_id_list = ["553242411"] - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1547,20 +1547,7 @@ if __name__ == "__main__": # 
document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx" # document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx" document_mapping_file = r"/data/aus_prospectus/basic_information/46_documents/aus_prospectus_46_documents_mapping.xlsx" - # special_doc_id_list: list = ["410899007", "539266880", "539266817", - # "539261734", "539266893"] - # special_doc_id_list: list = ["530101994", - # "539241700", - # "539261734", - # "539266814", - # "539266817", - # "539266874", - # "539266880", - # "539266893", - # "544886057", - # "550769189", - # "553449663"] - # special_doc_id_list = ["521606755"] + # special_doc_id_list = ["420339794"] # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" diff --git a/performance.ipynb b/performance.ipynb index 559c61d..efc72c8 100644 --- a/performance.ipynb +++ b/performance.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 33, "metadata": {}, "outputs": [ { @@ -44,53 +44,53 @@ "All Providers Results: \n", "Document List File - None\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9463 \t0.9087 \t0.9873 \t0.8986 \t431 \t388 \t2 \t39 \t5 \n", - "management_fee \t0.9502 \t0.9157 \t0.9874 \t0.9055 \t431 \t391 \t2 \t36 \t5 \n", - "performance_fee_costs \t0.8614 \t0.8473 \t0.8759 \t0.8272 \t281 \t233 \t126 \t42 \t33 \n", - "interposed_vehicle_performance_fee_cost \t0.9726 \t0.9467 \t1.0000 \t0.9908 \t72 \t71 \t359 \t4 \t0 \n", - "administration_fees \t0.9935 \t0.9872 \t1.0000 \t0.9977 \t77 \t77 \t356 \t1 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t362 \t0 \t0 \n", - "buy_spread \t0.9322 \t0.9066 
\t0.9593 \t0.8894 \t370 \t330 \t56 \t34 \t14 \n", - "sell_spread \t0.9352 \t0.9121 \t0.9595 \t0.8940 \t370 \t332 \t56 \t32 \t14 \n", - "minimum_initial_investment \t0.9577 \t0.9474 \t0.9684 \t0.9378 \t322 \t306 \t101 \t17 \t10 \n", - "benchmark_name \t0.8067 \t0.7562 \t0.8643 \t0.8664 \t154 \t121 \t255 \t39 \t19 \n", - "TOTAL \t0.9356 \t0.9128 \t0.9602 \t0.9207 \t2580 \t2321 \t1675 \t244 \t100 \n", - "Total Funds Matched - 434\n", - "Total Funds Not Matched - 131\n", - "Percentage of Funds Matched - 76.8141592920354\n", + "management_fee_and_costs \t0.9395 \t0.9002 \t0.9823 \t0.8858 \t437 \t388 \t0 \t43 \t7 \n", + "management_fee \t0.9496 \t0.9188 \t0.9826 \t0.9041 \t437 \t396 \t0 \t35 \t7 \n", + "performance_fee_costs \t0.8597 \t0.8755 \t0.8445 \t0.8219 \t298 \t239 \t121 \t34 \t44 \n", + "interposed_vehicle_performance_fee_cost \t0.9362 \t0.9429 \t0.9296 \t0.9795 \t72 \t66 \t363 \t4 \t5 \n", + "administration_fees \t0.7862 \t0.9828 \t0.6552 \t0.9292 \t87 \t57 \t350 \t1 \t30 \n", + "total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9772 \t82 \t72 \t356 \t10 \t0 \n", + "buy_spread \t0.9374 \t0.9208 \t0.9547 \t0.8973 \t375 \t337 \t56 \t29 \t16 \n", + "sell_spread \t0.9418 \t0.9290 \t0.9551 \t0.9041 \t375 \t340 \t56 \t26 \t16 \n", + "minimum_initial_investment \t0.9518 \t0.9457 \t0.9579 \t0.9315 \t315 \t296 \t112 \t17 \t13 \n", + "benchmark_name \t0.8553 \t0.8418 \t0.8693 \t0.8973 \t166 \t133 \t260 \t25 \t20 \n", + "TOTAL \t0.9093 \t0.9135 \t0.9131 \t0.9128 \t2644 \t2324 \t1674 \t224 \t158 \n", + "Total Funds Matched - 438\n", + "Total Funds Not Matched - 127\n", + "Percentage of Funds Matched - 77.5221238938053\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_29_documents_sample.txt\n", "Metric \tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9499 \t0.9096 \t0.9938 \t0.9045 \t177 \t161 \t0 \t16 \t1 \n", - "management_fee \t0.9529 \t0.9153 \t0.9939 
\t0.9101 \t177 \t162 \t0 \t15 \t1 \n", - "performance_fee_costs \t0.8197 \t0.7979 \t0.8427 \t0.8146 \t91 \t75 \t70 \t19 \t14 \n", - "interposed_vehicle_performance_fee_cost \t0.9811 \t0.9630 \t1.0000 \t0.9888 \t53 \t52 \t124 \t2 \t0 \n", - "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t170 \t0 \t0 \n", - "buy_spread \t0.9738 \t0.9653 \t0.9824 \t0.9494 \t174 \t167 \t2 \t6 \t3 \n", - "sell_spread \t0.9767 \t0.9711 \t0.9825 \t0.9551 \t174 \t168 \t2 \t5 \t3 \n", - "minimum_initial_investment \t0.9185 \t0.9118 \t0.9254 \t0.8764 \t140 \t124 \t32 \t12 \t10 \n", - "benchmark_name \t0.8121 \t0.7528 \t0.8816 \t0.8258 \t86 \t67 \t80 \t22 \t9 \n", - "TOTAL \t0.9316 \t0.9096 \t0.9558 \t0.9139 \t1080 \t984 \t480 \t97 \t141 \n", - "Total Funds Matched - 178\n", - "Total Funds Not Matched - 18\n", - "Percentage of Funds Matched - 90.81632653061224\n", + "management_fee_and_costs \t0.9419 \t0.9059 \t0.9809 \t0.8902 \t172 \t154 \t0 \t16 \t3 \n", + "management_fee \t0.9547 \t0.9294 \t0.9814 \t0.9133 \t172 \t158 \t0 \t12 \t3 \n", + "performance_fee_costs \t0.8315 \t0.9024 \t0.7708 \t0.8266 \t97 \t74 \t69 \t8 \t22 \n", + "interposed_vehicle_performance_fee_cost \t0.9630 \t0.9286 \t1.0000 \t0.9769 \t53 \t52 \t117 \t4 \t0 \n", + "administration_fees \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t8 \t8 \t165 \t0 \t0 \n", + "buy_spread \t0.9699 \t0.9699 \t0.9699 \t0.9422 \t169 \t161 \t2 \t5 \t5 \n", + "sell_spread \t0.9760 \t0.9819 \t0.9702 \t0.9538 \t169 \t163 \t2 \t3 \t5 \n", + "minimum_initial_investment \t0.9027 \t0.9062 \t0.8992 \t0.8555 \t135 \t116 \t32 \t12 \t13 \n", + "benchmark_name \t0.8333 \t0.8025 \t0.8667 \t0.8497 \t85 \t65 \t82 \t16 \t10 \n", + "TOTAL \t0.9303 \t0.9252 \t0.9377 \t0.9120 \t1060 \t951 \t469 \t76 \t219 \n", + "Total Funds Matched - 173\n", + "Total Funds Not Matched - 23\n", + "Percentage of Funds Matched - 88.26530612244898\n", "All Providers Results: \n", "Document List File - ./sample_documents/aus_prospectus_17_documents_sample.txt\n", "Metric 
\tF1-Score \tPrecision \tRecall \tAccuracy \tSUPPORT \tTP \tTN \tFP \tFN \n", - "management_fee_and_costs \t0.9439 \t0.9080 \t0.9827 \t0.8945 \t254 \t227 \t2 \t23 \t4 \n", - "management_fee \t0.9482 \t0.9160 \t0.9828 \t0.9023 \t254 \t229 \t2 \t21 \t4 \n", - "performance_fee_costs \t0.8827 \t0.8729 \t0.8927 \t0.8359 \t190 \t158 \t56 \t23 \t19 \n", - "interposed_vehicle_performance_fee_cost \t0.9500 \t0.9048 \t1.0000 \t0.9922 \t19 \t19 \t235 \t2 \t0 \n", - "administration_fees \t0.9928 \t0.9857 \t1.0000 \t0.9961 \t69 \t69 \t186 \t1 \t0 \n", - "total_annual_dollar_based_charges \t1.0000 \t1.0000 \t1.0000 \t1.0000 \t72 \t72 \t184 \t0 \t0 \n", - "buy_spread \t0.8932 \t0.8534 \t0.9368 \t0.8477 \t196 \t163 \t54 \t28 \t11 \n", - "sell_spread \t0.8962 \t0.8586 \t0.9371 \t0.8516 \t196 \t164 \t54 \t27 \t11 \n", - "minimum_initial_investment \t0.9864 \t0.9733 \t1.0000 \t0.9805 \t182 \t182 \t69 \t5 \t0 \n", - "benchmark_name \t0.8000 \t0.7606 \t0.8438 \t0.8945 \t68 \t54 \t175 \t17 \t10 \n", - "TOTAL \t0.9293 \t0.9033 \t0.9576 \t0.9195 \t1500 \t1337 \t1017 \t147 \t200 \n", - "Total Funds Matched - 256\n", - "Total Funds Not Matched - 113\n", - "Percentage of Funds Matched - 69.37669376693766\n" + "management_fee_and_costs \t0.9379 \t0.8966 \t0.9832 \t0.8830 \t265 \t234 \t0 \t27 \t4 \n", + "management_fee \t0.9463 \t0.9119 \t0.9835 \t0.8981 \t265 \t238 \t0 \t23 \t4 \n", + "performance_fee_costs \t0.8730 \t0.8639 \t0.8824 \t0.8189 \t201 \t165 \t52 \t26 \t22 \n", + "interposed_vehicle_performance_fee_cost \t0.8485 \t1.0000 \t0.7368 \t0.9811 \t19 \t14 \t246 \t0 \t5 \n", + "administration_fees \t0.7597 \t0.9800 \t0.6203 \t0.8830 \t79 \t49 \t185 \t1 \t30 \n", + "total_annual_dollar_based_charges \t0.9351 \t0.8780 \t1.0000 \t0.9623 \t82 \t72 \t183 \t10 \t0 \n", + "buy_spread \t0.9096 \t0.8800 \t0.9412 \t0.8679 \t206 \t176 \t54 \t24 \t11 \n", + "sell_spread \t0.9124 \t0.8850 \t0.9415 \t0.8717 \t206 \t177 \t54 \t23 \t11 \n", + "minimum_initial_investment \t0.9863 \t0.9730 \t1.0000 
\t0.9811 \t180 \t180 \t80 \t5 \t0 \n", + "benchmark_name \t0.8774 \t0.8831 \t0.8718 \t0.9283 \t81 \t68 \t178 \t9 \t10 \n", + "TOTAL \t0.8986 \t0.9151 \t0.8961 \t0.9075 \t1584 \t1373 \t1032 \t148 \t316 \n", + "Total Funds Matched - 265\n", + "Total Funds Not Matched - 104\n", + "Percentage of Funds Matched - 71.81571815718158\n" ] } ], @@ -114,7 +114,7 @@ "\"\"\"\n", "\n", "path_ground_truth = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx\"\n", - "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313153441.xlsx\"\n", + "path_generated_results = r\"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313224747.xlsx\"\n", "provider_mapping_file_path = r\"/data/aus_prospectus/ground_truth/phase2_file/46_documents/TopProvidersBiz.xlsx\"\n", "\n", "funds_matched = 0\n", @@ -198,36 +198,41 @@ " for i in intersection_list:\n", " for keys in imp_datapoints:\n", " if i == imp_datapoints_mapping[keys]:\n", + " truth = str(truth_values[i]).strip()\n", + " generated = str(generated_values[i]).strip()\n", " total = total +1\n", - " if truth_values[i] == \"\":\n", - " if truth_values[i] == generated_values[i]:\n", + " if truth == \"\":\n", + " if truth == generated:\n", " results[i][\"TN\"] = results[i][\"TN\"] + 1\n", " else:\n", " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" truth is null and generated - \", generated_values[i], sec_name) \n", - " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is null and generated is not null\"}\n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", + " \"truth\": truth, \"generated\": generated, \"error\": \"Truth is null and generated 
is not null\"}\n", " message_list.append(message) \n", " else:\n", - " if truth_values[i] == generated_values[i]:\n", + " if truth == generated:\n", " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", - " elif generated_values[i] != \"\":\n", - " if i == \"benchmark_name\" and compare_text(truth_values[i], generated_values[i]):\n", + " elif generated != \"\":\n", + " if i == \"benchmark_name\" and compare_text(truth, generated):\n", " results[i][\"TP\"] = results[i][\"TP\"] + 1\n", " else:\n", " results[i][\"FP\"] = results[i][\"FP\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" truth - \", truth_values[i], \" and generated - \", generated_values[i], \" \", sec_name)\n", - " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Truth is not equal with generated\"}\n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", + " \"truth\": truth, \"generated\": generated, \"error\": \"Truth is not equal with generated\"}\n", " message_list.append(message)\n", " else:\n", " results[i][\"FN\"] = results[i][\"FN\"] + 1\n", " # if \"Performance fee and cost\" in keys:\n", " debug = 0\n", " # print(keys, \" - \" , doc_id, \" generated is null and truth is - \", truth_values[i], sec_name)\n", - " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \"truth\": truth_values[i], \"generated\": generated_values[i], \"error\": \"Generated is null and truth is not null\"}\n", + " message = {\"data_point\": i, \"doc_id\": doc_id, \"sec_name\": sec_name, \n", + " \"truth\": truth, \"generated\": generated, \"error\": \"Generated is null and truth is not null\"}\n", " message_list.append(message)\n", " results[i][\"SUPPORT\"] = results[i][\"SUPPORT\"] + 1\n", " funds_matched += 1\n",