From 91530d608987ba96d27ce1aa4a8280a828400efb Mon Sep 17 00:00:00 2001
From: Blade He <Blade.He@morningstar.com>
Date: Fri, 20 Sep 2024 11:58:48 -0500
Subject: [PATCH] add more description for Performance Fees calculation rules

---
 core/data_extraction.py                       |   9 +-
 core/metrics.py                               | 117 ++++++++++++++----
 .../data_extraction_prompts_config.json       |  19 ++-
 main.py                                       |   3 +-
 4 files changed, 117 insertions(+), 31 deletions(-)

diff --git a/core/data_extraction.py b/core/data_extraction.py
index 6817d6b..ab4380d 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -556,6 +556,8 @@ class DataExtraction:
                     if data.get("share name", "") == "":
                         remove_list.append(data)
                         break
+                if data.get(key, "") == "":
+                    data.pop(key)
         for remove_data in remove_list:
             if remove_data in data_list:
                 data_list.remove(remove_data)
@@ -563,8 +565,11 @@ class DataExtraction:
         for data in data_list:
             performance_fee = data.get("performance_fee", None)
             if performance_fee is not None:
-                performance_fee = float(performance_fee)
-                if performance_fee > 3 and performance_fee % 2.5 == 0:
+                try:
+                    performance_fee = float(performance_fee)
+                    if performance_fee > 3 and performance_fee % 2.5 == 0:
+                        data.pop("performance_fee")
+                except:
                     data.pop("performance_fee")
         remove_list = []
         for data in data_list:
diff --git a/core/metrics.py b/core/metrics.py
index 1481f74..3ce6c41 100644
--- a/core/metrics.py
+++ b/core/metrics.py
@@ -378,19 +378,52 @@ class Metrics:
                     }
                     missing_error_data.append(error_data)
             else:
-                true_data.append(0)
-                pred_data.append(1)
-                error_data = {
-                    "doc_id": doc_id,
-                    "data_point": data_point,
-                    "page_index": pred_page_index,
-                    "pred_raw_name": pred_raw_name,
-                    "investment_type": pred_investment_type,
-                    "error_type": "raw name incorrect",
-                    "error_value": pred_raw_name,
-                    "correct_value": "",
-                }
-                missing_error_data.append(error_data)
+                # If the data point is performance fees and the value is 0,
+                # and a TER record exists with the same raw name and page index,
+                # then the prediction is treated as correct.
+                pred_value_num = None
+                try:
+                    pred_value_num = float(pred_data_point_value)
+                except:
+                    pass
+                if data_point == "performance_fee" and pred_value_num == 0:
+                    # get ter data with the same raw name from prediction_data
+                    ter_data_df = prediction_data[
+                        (prediction_data["datapoint"] == "ter") & 
+                        (prediction_data["simple_raw_name"] == pred_simple_raw_name) &
+                        (prediction_data["page_index"] == pred_page_index)
+                    ]
+                    if len(ter_data_df) > 0:
+                        true_data.append(1)
+                        pred_data.append(1)
+                    else:
+                        true_data.append(0)
+                        pred_data.append(1)
+                        error_data = {
+                            "doc_id": doc_id,
+                            "data_point": data_point,
+                            "page_index": pred_page_index,
+                            "pred_raw_name": pred_raw_name,
+                            "investment_type": pred_investment_type,
+                            "error_type": "raw name incorrect",
+                            "error_value": pred_raw_name,
+                            "correct_value": "",
+                        }
+                        missing_error_data.append(error_data)
+                else:   
+                    true_data.append(0)
+                    pred_data.append(1)
+                    error_data = {
+                        "doc_id": doc_id,
+                        "data_point": data_point,
+                        "page_index": pred_page_index,
+                        "pred_raw_name": pred_raw_name,
+                        "investment_type": pred_investment_type,
+                        "error_type": "raw name incorrect",
+                        "error_value": pred_raw_name,
+                        "correct_value": "",
+                    }
+                    missing_error_data.append(error_data)
 
         for index, ground_truth in dp_ground_truth.iterrows():
             gt_page_index = ground_truth["page_index"]
@@ -406,19 +439,51 @@ class Metrics:
             
             if gt_simple_name_unique_words not in pred_simple_name_unique_words_list and \
                 len(find_raw_name_in_pred) == 0:
-                true_data.append(1)
-                pred_data.append(0)
-                error_data = {
-                        "doc_id": doc_id,
-                        "data_point": data_point,
-                        "page_index": gt_page_index,
-                        "pred_raw_name": "",
-                        "investment_type": gt_investment_type,
-                        "error_type": "raw name missing",
-                        "error_value": "",
-                        "correct_value": gt_raw_name,
-                    }
-                missing_error_data.append(error_data)
+                gt_value_num = None
+                try:
+                    gt_value_num = float(gt_data_point_value)
+                except:
+                    pass
+                # If the data point is performance fees and the value is 0,
+                # and a TER record exists with the same raw name and page index,
+                # then the ground truth record is treated as correct.
+                if data_point == "performance_fee" and gt_value_num == 0:
+                    ter_data_df = ground_truth_data[
+                        (ground_truth_data["datapoint"] == "ter") & 
+                        (ground_truth_data["simple_raw_name"] == gt_simple_raw_name) &
+                        (ground_truth_data["page_index"] == gt_page_index)
+                    ]
+                    if len(ter_data_df) > 0:
+                        true_data.append(1)
+                        pred_data.append(1)
+                    else:
+                        true_data.append(1)
+                        pred_data.append(0)
+                        error_data = {
+                                "doc_id": doc_id,
+                                "data_point": data_point,
+                                "page_index": gt_page_index,
+                                "pred_raw_name": "",
+                                "investment_type": gt_investment_type,
+                                "error_type": "raw name missing",
+                                "error_value": "",
+                                "correct_value": gt_raw_name,
+                            }
+                        missing_error_data.append(error_data)
+                else:
+                    true_data.append(1)
+                    pred_data.append(0)
+                    error_data = {
+                            "doc_id": doc_id,
+                            "data_point": data_point,
+                            "page_index": gt_page_index,
+                            "pred_raw_name": "",
+                            "investment_type": gt_investment_type,
+                            "error_type": "raw name missing",
+                            "error_value": "",
+                            "correct_value": gt_raw_name,
+                        }
+                    missing_error_data.append(error_data)
 
         return true_data, pred_data, missing_error_data
     
diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json
index 728d6ff..e13b311 100644
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@@ -69,6 +69,7 @@
 			{
 				"title": "Latest data with time series data:",
 				"contents": [
+					"Simple case:",
 					"Some data table is with multiple date columns, please extract the data from the latest date column:",
 					"- Get dates from column header.",
 					"- Only extract data from the columns which column header is as the latest date.",
@@ -77,7 +78,15 @@
 					"performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom  \\n1 July  \\nFrom  \\n19 July  \\nFrom  \\n1 January   \\nFrom  \\n27 April  \\nFrom  \\n19 July  \\nFrom  \\n1 January \\n2021\\nFrom  \\n22 May \\n2021\\nFrom  \\n16 July \\n2021\\nFrom  \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}",
-					"The keywords are performance fees, the value 1.73 is the first number with the latest date-time."
+					"The keywords are performance fees, the value 1.73 is the first number with the latest date-time.",
+					"Complex case:",
+					"Some table with messy text as header, please extract the data from the first 1 - 2 data value columns:",
+					"Example context:",
+					"1RWHV WR WKH ILQDQFLDO VWDWHPHQWV Notes aux tats financiers\nLO Funds - 30/09/2023\n678 \n,6,1 &RGH \n6XE )XQGV \n6KDUH &ODVV \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \nCompartiments \nClasse \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de la \nComm. de \nPerformance \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de \nla Comm. de \nPerformance \n \f \n \f \n \f \n \f \n \f \n \f \n\b \n\b \n\b \n\b\n\b\n\b\nLU2376083999 \nTerreNeuve \nN A EUR SH X1 \n1.60 \n1.61 \n0.01 \n1.58 \n1.58 \n- \nLU1858044701 \nTerreNeuve \nN D GBP SH \n1.85 \n1.85 \n- \n1.84 \n1.86 \n- \n",
+					"Although the table is with messy text as header, but the latest date columns are the first 2 value columns, they are \"TER du Fonds\" and \"TER avec \nComm. de \nPerformance4\".",
+					"The TER value is from TER avec \nComm. de \nPerformance4, the performance fees value is from \"TER avec \nComm. de \nPerformance4\" - \"TER du Fonds\", e.g. 1.61 - 1.60 = 0.01, 1.85 - 1.85 = 0.",
+					"The output should be:",
+					"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}"
 				]
 			}
 		],
@@ -108,6 +117,7 @@
 			{
 				"title": "Performance fees is part of TER:",
 				"contents": [
+					"Common case:",
 					"If exist both of \"TER including performance fees\" and \"TER excluding performance fees\",",
 					"The TER should be \"TER including performance fees\".",
 					"The performance fees should be:",
@@ -116,7 +126,12 @@
 					"GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}",
-					"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0"
+					"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0",
+					"Special case:",
+					"Attention: if some table is with three value columns: TER excluding performance fees, TER including performance fees, Performance fees, ",
+					"The Performance fees value in column: Performance fees, could be \"-\", because of TER including performance fees - TER excluding performance fees = 0, ",
+					"But it's incorrect, according to this issue, please still extract performance fees from TER including performance fees - TER excluding performance fees.",
+					"To make sure performance fees is with actual value."
 				]
 			}
 		]
diff --git a/main.py b/main.py
index 4db74d9..6a13a3c 100644
--- a/main.py
+++ b/main.py
@@ -577,7 +577,8 @@ if __name__ == "__main__":
     #              extract_way,
     #              re_run_extract_data)
     
-    special_doc_id_list = ["349679479"]
+    special_doc_id_list = ["505174428", "510326848", "349679479"]
+    # special_doc_id_list = ["505174428"]
     output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
     output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
     re_run_mapping_data = True