optimize instructions

optimize metrics algorithm
2024-09-20 16:46:44 -05:00 · 2024-09-20 16:46:44 -05:00 · 8496c7b5ed
parent 91530d6089
commit 8496c7b5ed
5 changed files with 151 additions and 118 deletions
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -567,7 +567,8 @@ class DataExtraction:
            if performance_fee is not None:
                try:
                    performance_fee = float(performance_fee)
-                    if performance_fee > 3 and performance_fee % 2.5 == 0:
+                    if (performance_fee > 3 and performance_fee % 2.5 == 0) or \
+                        performance_fee > 10:
                        data.pop("performance_fee")
                except:
                    data.pop("performance_fee")
--- a/core/metrics.py
+++ b/core/metrics.py
@ -120,7 +120,9 @@ class Metrics:
            prediction_doc_id_list = prediction_df["doc_id"].unique().tolist()
            ground_truth_doc_id_list = ground_truth_df["doc_id"].unique().tolist()
            # get intersection of doc_id_list
-            doc_id_list = list(set(prediction_doc_id_list) & set(ground_truth_doc_id_list))
+            doc_id_list = list(
+                set(prediction_doc_id_list) & set(ground_truth_doc_id_list)
+            )
            # order by doc_id
            doc_id_list.sort()

@ -296,14 +298,18 @@ class Metrics:
        dp_prediction = prediction_data[prediction_data["datapoint"] == data_point]
        dp_prediction = self.modify_data(dp_prediction)
        pred_simple_raw_names = dp_prediction["simple_raw_name"].unique().tolist()
-        pred_simple_name_unique_words_list = dp_prediction["simple_name_unique_words"].unique().tolist()
+        pred_simple_name_unique_words_list = (
+            dp_prediction["simple_name_unique_words"].unique().tolist()
+        )

        dp_ground_truth = ground_truth_data[
            ground_truth_data["datapoint"] == data_point
        ]
        dp_ground_truth = self.modify_data(dp_ground_truth)
        gt_simple_raw_names = dp_ground_truth["simple_raw_name"].unique().tolist()
-        gt_simple_name_unique_words_list = dp_ground_truth["simple_name_unique_words"].unique().tolist()
+        gt_simple_name_unique_words_list = (
+            dp_ground_truth["simple_name_unique_words"].unique().tolist()
+        )

        true_data = []
        pred_data = []
@ -323,21 +329,36 @@ class Metrics:
            pred_data_point_value = prediction["value"]
            pred_investment_type = prediction["investment_type"]

-            find_raw_name_in_gt = [gt_raw_name for gt_raw_name in gt_simple_raw_names 
-                                   if (gt_raw_name in pred_simple_raw_name or pred_simple_raw_name in gt_raw_name)
-                                   and gt_raw_name.endswith(pred_raw_name.split()[-1])]
-            if pred_simple_name_unique_words in gt_simple_name_unique_words_list or \
-                len(find_raw_name_in_gt) > 0:
+            find_raw_name_in_gt = [
+                gt_raw_name
+                for gt_raw_name in gt_simple_raw_names
+                if (
+                    gt_raw_name in pred_simple_raw_name
+                    or pred_simple_raw_name in gt_raw_name
+                )
+                and gt_raw_name.endswith(pred_simple_raw_name.split()[-1])
+            ]
+
+            if (
+                pred_simple_name_unique_words in gt_simple_name_unique_words_list
+                or len(find_raw_name_in_gt) > 0
+            ):
                # get the ground truth data with the same unique words
                if pred_simple_name_unique_words in gt_simple_name_unique_words_list:
                    gt_data_df = dp_ground_truth[
-                        dp_ground_truth["simple_name_unique_words"] == pred_simple_name_unique_words
+                        dp_ground_truth["simple_name_unique_words"]
+                        == pred_simple_name_unique_words
                    ]
                    if len(gt_data_df) > 1:
-                        if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0:
+                        if (
+                            len(gt_data_df[gt_data_df["page_index"] == pred_page_index])
+                            == 0
+                        ):
                            gt_data = gt_data_df.iloc[0]
                        else:
-                            gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0]
+                            gt_data = gt_data_df[
+                                gt_data_df["page_index"] == pred_page_index
+                            ].iloc[0]
                    elif len(gt_data_df) == 1:
                        gt_data = gt_data_df.iloc[0]
                    else:
@ -347,10 +368,15 @@ class Metrics:
                        dp_ground_truth["simple_raw_name"] == find_raw_name_in_gt[0]
                    ]
                    if len(gt_data_df) > 1:
-                        if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0:
+                        if (
+                            len(gt_data_df[gt_data_df["page_index"] == pred_page_index])
+                            == 0
+                        ):
                            gt_data = gt_data_df.iloc[0]
                        else:
-                            gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0]
+                            gt_data = gt_data_df[
+                                gt_data_df["page_index"] == pred_page_index
+                            ].iloc[0]
                    elif len(gt_data_df) == 1:
                        gt_data = gt_data_df.iloc[0]
                    else:
@ -359,8 +385,10 @@ class Metrics:
                    gt_data_point_value = None
                else:
                    gt_data_point_value = gt_data["value"]
-                if gt_data_point_value is not None and \
-                    pred_data_point_value == gt_data_point_value:
+                if (
+                    gt_data_point_value is not None
+                    and pred_data_point_value == gt_data_point_value
+                ):
                    true_data.append(1)
                    pred_data.append(1)
                else:
@ -379,7 +407,6 @@ class Metrics:
                    missing_error_data.append(error_data)
            else:
                # If data point is performance fees, and value is 0,
-                # If exist ter record with same raw name and same page inde, 
                # then it's correct
                pred_value_num = None
                try:
@ -387,13 +414,6 @@ class Metrics:
                except:
                    pass
                if data_point == "performance_fee" and pred_value_num == 0:
-                    # get ter data with the same raw name from prediction_data
-                    ter_data_df = prediction_data[
-                        (prediction_data["datapoint"] == "ter") & 
-                        (prediction_data["simple_raw_name"] == pred_simple_raw_name) &
-                        (prediction_data["page_index"] == pred_page_index)
-                    ]
-                    if len(ter_data_df) > 0:
                    true_data.append(1)
                    pred_data.append(1)
                else:
@ -410,20 +430,6 @@ class Metrics:
                        "correct_value": "",
                    }
                    missing_error_data.append(error_data)
-                else:   
-                    true_data.append(0)
-                    pred_data.append(1)
-                    error_data = {
-                        "doc_id": doc_id,
-                        "data_point": data_point,
-                        "page_index": pred_page_index,
-                        "pred_raw_name": pred_raw_name,
-                        "investment_type": pred_investment_type,
-                        "error_type": "raw name incorrect",
-                        "error_value": pred_raw_name,
-                        "correct_value": "",
-                    }
-                    missing_error_data.append(error_data)

        for index, ground_truth in dp_ground_truth.iterrows():
            gt_page_index = ground_truth["page_index"]
@ -433,27 +439,28 @@ class Metrics:
            gt_data_point_value = ground_truth["value"]
            gt_investment_type = ground_truth["investment_type"]

-            find_raw_name_in_pred = [pred_raw_name for pred_raw_name in pred_simple_raw_names 
-                                   if (gt_simple_raw_name in pred_raw_name or pred_raw_name in gt_simple_raw_name)
-                                   and pred_raw_name.endswith(gt_raw_name.split()[-1])]
+            find_raw_name_in_pred = [
+                pred_raw_name
+                for pred_raw_name in pred_simple_raw_names
+                if (
+                    gt_simple_raw_name in pred_raw_name
+                    or pred_raw_name in gt_simple_raw_name
+                )
+                and pred_raw_name.endswith(gt_simple_raw_name.split()[-1])
+            ]

-            if gt_simple_name_unique_words not in pred_simple_name_unique_words_list and \
-                len(find_raw_name_in_pred) == 0:
+            if (
+                gt_simple_name_unique_words not in pred_simple_name_unique_words_list
+                and len(find_raw_name_in_pred) == 0
+            ):
                gt_value_num = None
                try:
                    gt_value_num = float(gt_data_point_value)
                except:
                    pass
                # If data point is performance fees, and value is 0,
-                # If exist ter record with same raw name and same page inde, 
                # then it's correct
                if data_point == "performance_fee" and gt_value_num == 0:
-                    ter_data_df = ground_truth_data[
-                        (ground_truth_data["datapoint"] == "ter") & 
-                        (ground_truth_data["simple_raw_name"] == gt_simple_raw_name) &
-                        (ground_truth_data["page_index"] == gt_page_index)
-                    ]
-                    if len(ter_data_df) > 0:
                    true_data.append(1)
                    pred_data.append(1)
                else:
@ -470,20 +477,6 @@ class Metrics:
                        "correct_value": gt_raw_name,
                    }
                    missing_error_data.append(error_data)
-                else:
-                    true_data.append(1)
-                    pred_data.append(0)
-                    error_data = {
-                            "doc_id": doc_id,
-                            "data_point": data_point,
-                            "page_index": gt_page_index,
-                            "pred_raw_name": "",
-                            "investment_type": gt_investment_type,
-                            "error_type": "raw name missing",
-                            "error_value": "",
-                            "correct_value": gt_raw_name,
-                        }
-                    missing_error_data.append(error_data)

        return true_data, pred_data, missing_error_data

@ -496,15 +489,40 @@ class Metrics:
            raw_name_list = page_data["raw_name"].unique().tolist()
            beginning_common_words = get_beginning_common_words(raw_name_list)
            for raw_name in raw_name_list:
-                if beginning_common_words is not None and len(beginning_common_words) > 0:
-                    simple_raw_name = raw_name.replace(beginning_common_words, "").strip()
+                if (
+                    beginning_common_words is not None
+                    and len(beginning_common_words) > 0
+                ):
+                    simple_raw_name = raw_name.replace(
+                        beginning_common_words, ""
+                    ).strip()
+                    if len(simple_raw_name) == 0:
+                        simple_raw_name = raw_name
                else:
                    simple_raw_name = raw_name
+                temp_splits = [word for word in simple_raw_name.split() 
+                               if word.lower() not in ["class", "usd"]]
+                if len(temp_splits) > 0:
+                    simple_raw_name = " ".join(
+                        word
+                        for word in simple_raw_name.split()
+                        if word.lower() not in ["class"]
+                    )
+                    simple_raw_name_splits = simple_raw_name.split()
+                    if len(simple_raw_name_splits) > 2 and \
+                        simple_raw_name_splits[-1] == "USD":
+                        simple_raw_name = " ".join(simple_raw_name_splits[:-1])
                # set simple_raw_name for the rows with the same page and the same raw_name
-                data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name), 
-                         "simple_raw_name"] = simple_raw_name
-                data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name),
-                            "simple_name_unique_words"] = get_unique_words_text(simple_raw_name)
+                data.loc[
+                    (data["page_index"] == pagex_index)
+                    & (data["raw_name"] == raw_name),
+                    "simple_raw_name",
+                ] = simple_raw_name
+                data.loc[
+                    (data["page_index"] == pagex_index)
+                    & (data["raw_name"] == raw_name),
+                    "simple_name_unique_words",
+                ] = get_unique_words_text(simple_raw_name)
        return data

    def get_specific_metrics(self, true_data: list, pred_data: list):
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@ -69,24 +69,30 @@
 			{
 				"title": "Latest data with time series data:",
 				"contents": [
-					"Simple case:",
+					"Case 1:",
 					"Some data table is with multiple date columns, please extract the data from the latest date column:",
 					"- Get dates from column header.",
 					"- Only extract data from the columns which column header is as the latest date.",
 					"The latest date-time column usually is the first datapoint value column.",
 					"Here is the example:",
+					"-----Example Start-----",
 					"performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom  \\n1 July  \\nFrom  \\n19 July  \\nFrom  \\n1 January   \\nFrom  \\n27 April  \\nFrom  \\n19 July  \\nFrom  \\n1 January \\n2021\\nFrom  \\n22 May \\n2021\\nFrom  \\n16 July \\n2021\\nFrom  \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n",
+					"-----Example End-----",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}",
 					"The keywords are performance fees, the value 1.73 is the first number with the latest date-time.",
-					"Complex case:",
+					"Case 2:",
 					"Some table with messy text as header, please extract the data from the first 1 - 2 data value columns:",
 					"Example context:",
+					"-----Example Start-----",
 					"1RWHV WR WKH ILQDQFLDO VWDWHPHQWV Notes aux tats financiers\nLO Funds - 30/09/2023\n678 \n,6,1 &RGH \n6XE )XQGV \n6KDUH &ODVV \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \nCompartiments \nClasse \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de la \nComm. de \nPerformance \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de \nla Comm. de \nPerformance \n \f \n \f \n \f \n \f \n \f \n \f \n\b \n\b \n\b \n\b\n\b\n\b\nLU2376083999 \nTerreNeuve \nN A EUR SH X1 \n1.60 \n1.61 \n0.01 \n1.58 \n1.58 \n- \nLU1858044701 \nTerreNeuve \nN D GBP SH \n1.85 \n1.85 \n- \n1.84 \n1.86 \n- \n",
+					"-----Example End-----",
 					"Although the table is with messy text as header, but the latest date columns are the first 2 value columns, they are \"TER du Fonds\" and \"TER avec \nComm. de \nPerformance4\".",
 					"The TER value is from TER avec \nComm. de \nPerformance4, the performance fees value is from \"TER avec \nComm. de \nPerformance4\" - \"TER du Fonds\", e.g. 1.61 - 1.60 = 0.01, 1.85 - 1.85 = 0.",
 					"The output should be:",
-					"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}"
+					"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
+					"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
+					"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns."
 				]
 			}
 		],
@ -101,7 +107,9 @@
 					"- \"feeder fund share class\" and \"TER feeder\" values",
 					"- \"Master fund\" and \"TER Master\" values",
 					"Here is the example:",
+					"-----Example Start-----",
 					"Feeder fund (share class)\\nMaster fund\\nTER\\nFeeder\\nTER Master\\nTotal\\nGlobal Portfolio Solution DKK -\\nBalanced Class TI\\nDanske Invest SICAV Global Portfolio\\nSolution   Balanced Class X\\n0.1475%\\n0.7025%\\n0.850%\\n",
+					"-----Example End-----",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"Global Portfolio Solution DKK\", \"share name\": \"Balanced Class TI\", \"ter\": 0.1475}, {\"fund name\": \"Danske Invest SICAV Global Portfolio Solution DKK\", \"share name\": \"Balanced Class X\", \"ter\": 0.7025}]}"
 				]
@ -117,17 +125,19 @@
 			{
 				"title": "Performance fees is part of TER:",
 				"contents": [
-					"Common case:",
+					"Case 1:",
 					"If exist both of \"TER including performance fees\" and \"TER excluding performance fees\",",
 					"The TER should be \"TER including performance fees\".",
 					"The performance fees should be:",
 					"TER including performance fees - TER excluding performance fees.",
 					"Here is the example:",
+					"-----Example Start-----",
 					"GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n",
+					"-----Example End-----",
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}",
 					"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0",
-					"Sepcial case:",
+					"Case 2:",
 					"Attention: if some table is with three value columns: TER excluding performance fees, TER including performance fees, Performance fees, ",
 					"The Performance fees value in column: Performance fees, chould be \"-\", because of TER including performance fees - TER excluding performance fees = 0, ", 
 					"But it's incorrect, according to this issue, please still extract performance fees from TER including performance fees - TER excluding performance fees.",
@ -146,8 +156,9 @@
 			"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
 			"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
 			"Example:",
-			"Context:",
+			"-----Example Start-----",
 			"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
+			"-----Example End-----",
 			"Output:",
 			"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
 			"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
--- a/main.py
+++ b/main.py
@ -522,8 +522,9 @@ def test_auto_generate_instructions():

 def test_data_extraction_metrics():
    data_type = "data_extraction"
-    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_20240919120502.xlsx"
-    # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/509350496.xlsx"
+    prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
+    # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240920153730.xlsx"
+    # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/469138353.xlsx"
    prediction_sheet_name = "mapping_data"
    ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
    ground_truth_sheet_name = "mapping_data"
@ -577,27 +578,27 @@ if __name__ == "__main__":
    #              extract_way,
    #              re_run_extract_data)
    
-    special_doc_id_list = ["505174428", "510326848", "349679479"]
-    # special_doc_id_list = ["505174428"]
+    # special_doc_id_list = ["505174428", "510326848", "349679479"]
+    special_doc_id_list = []
    output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_mapping_data = True
    force_save_total_data = False
    
    extract_ways = ["text"]
-    for extract_way in extract_ways:
-        batch_start_job(
-            pdf_folder,
-            page_filter_ground_truth_file,
-            output_extract_data_child_folder,
-            output_mapping_child_folder,
-            output_extract_data_total_folder,
-            output_mapping_total_folder,
-            extract_way,
-            special_doc_id_list,
-            re_run_extract_data,
-            re_run_mapping_data,
-            force_save_total_data=force_save_total_data,
-        )
+    # for extract_way in extract_ways:
+    #     batch_start_job(
+    #         pdf_folder,
+    #         page_filter_ground_truth_file,
+    #         output_extract_data_child_folder,
+    #         output_mapping_child_folder,
+    #         output_extract_data_total_folder,
+    #         output_mapping_total_folder,
+    #         extract_way,
+    #         special_doc_id_list,
+    #         re_run_extract_data,
+    #         re_run_mapping_data,
+    #         force_save_total_data=force_save_total_data,
+    #     )
    
-    # test_data_extraction_metrics()
+    test_data_extraction_metrics()
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -269,6 +269,8 @@ def replace_abbrevation(text: str):
        text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
    elif '€' in text.lower().split():
        text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE)
+    elif 'RMB' in text.lower().split():
+        text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
    else:
        pass