From 8496c7b5edcd1f808d1d570ac89a8095a992a646 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 20 Sep 2024 16:46:44 -0500 Subject: [PATCH] optimize instructions optimize metrics algorithm --- core/data_extraction.py | 3 +- core/metrics.py | 202 ++++++++++-------- .../data_extraction_prompts_config.json | 23 +- main.py | 39 ++-- utils/biz_utils.py | 2 + 5 files changed, 151 insertions(+), 118 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index ab4380d..fba6bb1 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -567,7 +567,8 @@ class DataExtraction: if performance_fee is not None: try: performance_fee = float(performance_fee) - if performance_fee > 3 and performance_fee % 2.5 == 0: + if (performance_fee > 3 and performance_fee % 2.5 == 0) or \ + performance_fee > 10: data.pop("performance_fee") except: data.pop("performance_fee") diff --git a/core/metrics.py b/core/metrics.py index 3ce6c41..1711d26 100644 --- a/core/metrics.py +++ b/core/metrics.py @@ -52,7 +52,7 @@ class Metrics: metrics_list = [ {"Data_Point": "NAN", "Precision": 0, "Recall": 0, "F1": 0, "Support": 0} ] - + missing_error_list, metrics_list = self.calculate_metrics() missing_error_df = pd.DataFrame(missing_error_list) @@ -120,7 +120,9 @@ class Metrics: prediction_doc_id_list = prediction_df["doc_id"].unique().tolist() ground_truth_doc_id_list = ground_truth_df["doc_id"].unique().tolist() # get intersection of doc_id_list - doc_id_list = list(set(prediction_doc_id_list) & set(ground_truth_doc_id_list)) + doc_id_list = list( + set(prediction_doc_id_list) & set(ground_truth_doc_id_list) + ) # order by doc_id doc_id_list.sort() @@ -296,15 +298,19 @@ class Metrics: dp_prediction = prediction_data[prediction_data["datapoint"] == data_point] dp_prediction = self.modify_data(dp_prediction) pred_simple_raw_names = dp_prediction["simple_raw_name"].unique().tolist() - pred_simple_name_unique_words_list = dp_prediction["simple_name_unique_words"].unique().tolist() - + pred_simple_name_unique_words_list = ( + dp_prediction["simple_name_unique_words"].unique().tolist() + ) + dp_ground_truth = ground_truth_data[ ground_truth_data["datapoint"] == data_point ] dp_ground_truth = self.modify_data(dp_ground_truth) gt_simple_raw_names = dp_ground_truth["simple_raw_name"].unique().tolist() - gt_simple_name_unique_words_list = dp_ground_truth["simple_name_unique_words"].unique().tolist() - + gt_simple_name_unique_words_list = ( + dp_ground_truth["simple_name_unique_words"].unique().tolist() + ) + true_data = [] pred_data = [] @@ -314,7 +320,7 @@ class Metrics: true_data.append(1) pred_data.append(1) return true_data, pred_data, missing_error_data - + for index, prediction in dp_prediction.iterrows(): pred_page_index = prediction["page_index"] pred_raw_name = prediction["raw_name"] @@ -323,21 +329,36 @@ class Metrics: pred_data_point_value = prediction["value"] pred_investment_type = prediction["investment_type"] - find_raw_name_in_gt = [gt_raw_name for gt_raw_name in gt_simple_raw_names - if (gt_raw_name in pred_simple_raw_name or pred_simple_raw_name in gt_raw_name) - and gt_raw_name.endswith(pred_raw_name.split()[-1])] - if pred_simple_name_unique_words in gt_simple_name_unique_words_list or \ - len(find_raw_name_in_gt) > 0: + find_raw_name_in_gt = [ + gt_raw_name + for gt_raw_name in gt_simple_raw_names + if ( + gt_raw_name in pred_simple_raw_name + or pred_simple_raw_name in gt_raw_name + ) + and gt_raw_name.endswith(pred_simple_raw_name.split()[-1]) + ] + + if ( + pred_simple_name_unique_words in gt_simple_name_unique_words_list + or len(find_raw_name_in_gt) > 0 + ): # get the ground truth data with the same unique words if pred_simple_name_unique_words in gt_simple_name_unique_words_list: gt_data_df = dp_ground_truth[ - dp_ground_truth["simple_name_unique_words"] == pred_simple_name_unique_words + dp_ground_truth["simple_name_unique_words"] + == pred_simple_name_unique_words ] if len(gt_data_df) > 1: - if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0: + if ( + len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) + == 0 + ): gt_data = gt_data_df.iloc[0] else: - gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0] + gt_data = gt_data_df[ + gt_data_df["page_index"] == pred_page_index + ].iloc[0] elif len(gt_data_df) == 1: gt_data = gt_data_df.iloc[0] else: @@ -347,10 +368,15 @@ class Metrics: dp_ground_truth["simple_raw_name"] == find_raw_name_in_gt[0] ] if len(gt_data_df) > 1: - if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0: + if ( + len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) + == 0 + ): gt_data = gt_data_df.iloc[0] else: - gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0] + gt_data = gt_data_df[ + gt_data_df["page_index"] == pred_page_index + ].iloc[0] elif len(gt_data_df) == 1: gt_data = gt_data_df.iloc[0] else: @@ -359,8 +385,10 @@ class Metrics: gt_data_point_value = None else: gt_data_point_value = gt_data["value"] - if gt_data_point_value is not None and \ - pred_data_point_value == gt_data_point_value: + if ( + gt_data_point_value is not None + and pred_data_point_value == gt_data_point_value + ): true_data.append(1) pred_data.append(1) else: @@ -378,8 +406,7 @@ class Metrics: } missing_error_data.append(error_data) else: - # If data point is performance fees, and value is 0, - # If exist ter record with same raw name and same page inde, + # If data point is performance fees, and value is 0, # then it's correct pred_value_num = None try: @@ -387,30 +414,9 @@ class Metrics: except: pass if data_point == "performance_fee" and pred_value_num == 0: - # get ter data with the same raw name from prediction_data - ter_data_df = prediction_data[ - (prediction_data["datapoint"] == "ter") & - (prediction_data["simple_raw_name"] == pred_simple_raw_name) & - (prediction_data["page_index"] == pred_page_index) - ] - if len(ter_data_df) > 0: - true_data.append(1) - pred_data.append(1) - else: - true_data.append(0) - pred_data.append(1) - error_data = { - "doc_id": doc_id, - "data_point": data_point, - "page_index": pred_page_index, - "pred_raw_name": pred_raw_name, - "investment_type": pred_investment_type, - "error_type": "raw name incorrect", - "error_value": pred_raw_name, - "correct_value": "", - } - missing_error_data.append(error_data) - else: + true_data.append(1) + pred_data.append(1) + else: true_data.append(0) pred_data.append(1) error_data = { @@ -432,61 +438,48 @@ class Metrics: gt_simple_name_unique_words = ground_truth["simple_name_unique_words"] gt_data_point_value = ground_truth["value"] gt_investment_type = ground_truth["investment_type"] - - find_raw_name_in_pred = [pred_raw_name for pred_raw_name in pred_simple_raw_names - if (gt_simple_raw_name in pred_raw_name or pred_raw_name in gt_simple_raw_name) - and pred_raw_name.endswith(gt_raw_name.split()[-1])] - - if gt_simple_name_unique_words not in pred_simple_name_unique_words_list and \ - len(find_raw_name_in_pred) == 0: + + find_raw_name_in_pred = [ + pred_raw_name + for pred_raw_name in pred_simple_raw_names + if ( + gt_simple_raw_name in pred_raw_name + or pred_raw_name in gt_simple_raw_name + ) + and pred_raw_name.endswith(gt_simple_raw_name.split()[-1]) + ] + + if ( + gt_simple_name_unique_words not in pred_simple_name_unique_words_list + and len(find_raw_name_in_pred) == 0 + ): gt_value_num = None try: gt_value_num = float(gt_data_point_value) except: pass - # If data point is performance fees, and value is 0, - # If exist ter record with same raw name and same page inde, + # If data point is performance fees, and value is 0, # then it's correct if data_point == "performance_fee" and gt_value_num == 0: - ter_data_df = ground_truth_data[ - (ground_truth_data["datapoint"] == "ter") & - (ground_truth_data["simple_raw_name"] == gt_simple_raw_name) & - (ground_truth_data["page_index"] == gt_page_index) - ] - if len(ter_data_df) > 0: - true_data.append(1) - pred_data.append(1) - else: - true_data.append(1) - pred_data.append(0) - error_data = { - "doc_id": doc_id, - "data_point": data_point, - "page_index": gt_page_index, - "pred_raw_name": "", - "investment_type": gt_investment_type, - "error_type": "raw name missing", - "error_value": "", - "correct_value": gt_raw_name, - } - missing_error_data.append(error_data) + true_data.append(1) + pred_data.append(1) else: true_data.append(1) pred_data.append(0) error_data = { - "doc_id": doc_id, - "data_point": data_point, - "page_index": gt_page_index, - "pred_raw_name": "", - "investment_type": gt_investment_type, - "error_type": "raw name missing", - "error_value": "", - "correct_value": gt_raw_name, - } + "doc_id": doc_id, + "data_point": data_point, + "page_index": gt_page_index, + "pred_raw_name": "", + "investment_type": gt_investment_type, + "error_type": "raw name missing", + "error_value": "", + "correct_value": gt_raw_name, + } missing_error_data.append(error_data) return true_data, pred_data, missing_error_data - + def modify_data(self, data: pd.DataFrame): data["simple_raw_name"] = "" data["simple_name_unique_words"] = "" @@ -496,15 +489,40 @@ class Metrics: raw_name_list = page_data["raw_name"].unique().tolist() beginning_common_words = get_beginning_common_words(raw_name_list) for raw_name in raw_name_list: - if beginning_common_words is not None and len(beginning_common_words) > 0: - simple_raw_name = raw_name.replace(beginning_common_words, "").strip() + if ( + beginning_common_words is not None + and len(beginning_common_words) > 0 + ): + simple_raw_name = raw_name.replace( + beginning_common_words, "" + ).strip() + if len(simple_raw_name) == 0: + simple_raw_name = raw_name else: simple_raw_name = raw_name + temp_splits = [word for word in simple_raw_name.split() + if word.lower() not in ["class", "usd"]] + if len(temp_splits) > 0: + simple_raw_name = " ".join( + word + for word in simple_raw_name.split() + if word.lower() not in ["class"] + ) + simple_raw_name_splits = simple_raw_name.split() + if len(simple_raw_name_splits) > 2 and \ + simple_raw_name_splits[-1] == "USD": + simple_raw_name = " ".join(simple_raw_name_splits[:-1]) # set simple_raw_name which with the same page and same raw_name - data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name), - "simple_raw_name"] = simple_raw_name - data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name), - "simple_name_unique_words"] = get_unique_words_text(simple_raw_name) + data.loc[ + (data["page_index"] == pagex_index) + & (data["raw_name"] == raw_name), + "simple_raw_name", + ] = simple_raw_name + data.loc[ + (data["page_index"] == pagex_index) + & (data["raw_name"] == raw_name), + "simple_name_unique_words", + ] = get_unique_words_text(simple_raw_name) return data def get_specific_metrics(self, true_data: list, pred_data: list): diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index e13b311..3e24a5d 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -69,24 +69,30 @@ { "title": "Latest data with time series data:", "contents": [ - "Simple case:", + "Case 1:", "Some data table is with multiple date columns, please extract the data from the latest date column:", "- Get dates from column header.", "- Only extract data from the columns which column header is as the latest date.", "The latest date-time column usually is the first datapoint value column.", "Here is the example:", + "-----Example Start-----", "performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom \\n1 July \\nFrom \\n19 July \\nFrom \\n1 January \\nFrom \\n27 April \\nFrom \\n19 July \\nFrom \\n1 January \\n2021\\nFrom \\n22 May \\n2021\\nFrom \\n16 July \\n2021\\nFrom \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n", + "-----Example End-----", "The output should be:", "{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}", "The keywords are performance fees, the value 1.73 is the first number with the latest date-time.", - "Complex case:", + "Case 2:", "Some table with messy text as header, please extract the data from the first 1 - 2 data value columns:", "Example context:", + "-----Example Start-----", "1RWHV WR WKH ILQDQFLDO VWDWHPHQWV Notes aux tats financiers\nLO Funds - 30/09/2023\n678 \n,6,1 &RGH \n6XE )XQGV \n6KDUH &ODVV \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \nCompartiments \nClasse \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de la \nComm. de \nPerformance \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de \nla Comm. de \nPerformance \n \f \n \f \n \f \n \f \n \f \n \f \n\b \n\b \n\b \n\b\n\b\n\b\nLU2376083999 \nTerreNeuve \nN A EUR SH X1 \n1.60 \n1.61 \n0.01 \n1.58 \n1.58 \n- \nLU1858044701 \nTerreNeuve \nN D GBP SH \n1.85 \n1.85 \n- \n1.84 \n1.86 \n- \n", + "-----Example End-----", "Although the table is with messy text as header, but the latest date columns are the first 2 value columns, they are \"TER du Fonds\" and \"TER avec \nComm. de \nPerformance4\".", "The TER value is from TER avec \nComm. de \nPerformance4, the performance fees value is from \"TER avec \nComm. de \nPerformance4\" - \"TER du Fonds\", e.g. 1.61 - 1.60 = 0.01, 1.85 - 1.85 = 0.", "The output should be:", - "{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}" + "{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}", + "Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).", + "If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns." ] } ], @@ -101,7 +107,9 @@ "- \"feeder fund share class\" and \"TER feeder\" values", "- \"Master fund\" and \"TER Master\" values", "Here is the example:", + "-----Example Start-----", "Feeder fund (share class)\\nMaster fund\\nTER\\nFeeder\\nTER Master\\nTotal\\nGlobal Portfolio Solution DKK -\\nBalanced Class TI\\nDanske Invest SICAV Global Portfolio\\nSolution Balanced Class X\\n0.1475%\\n0.7025%\\n0.850%\\n", + "-----Example End-----", "The output should be:", "{\"data\": [{\"fund name\": \"Global Portfolio Solution DKK\", \"share name\": \"Balanced Class TI\", \"ter\": 0.1475}, {\"fund name\": \"Danske Invest SICAV Global Portfolio Solution DKK\", \"share name\": \"Balanced Class X\", \"ter\": 0.7025}]}" ] @@ -117,17 +125,19 @@ { "title": "Performance fees is part of TER:", "contents": [ - "Common case:", + "Case 1:", "If exist both of \"TER including performance fees\" and \"TER excluding performance fees\",", "The TER should be \"TER including performance fees\".", "The performance fees should be:", "TER including performance fees - TER excluding performance fees.", "Here is the example:", + "-----Example Start-----", "GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n", + "-----Example End-----", "The output should be:", "{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}", "The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0", - "Sepcial case:", + "Case 2:", "Attention: if some table is with three value columns: TER excluding performance fees, TER including performance fees, Performance fees, ", "The Performance fees value in column: Performance fees, chould be \"-\", because of TER including performance fees - TER excluding performance fees = 0, ", "But it's incorrect, according to this issue, please still extract performance fees from TER including performance fees - TER excluding performance fees.", @@ -146,8 +156,9 @@ "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", "Example:", - "Context:", + "-----Example Start-----", "Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n", + "-----Example End-----", "Output:", "{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}", "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", diff --git a/main.py b/main.py index 6a13a3c..92ded42 100644 --- a/main.py +++ b/main.py @@ -522,8 +522,9 @@ def test_auto_generate_instructions(): def test_data_extraction_metrics(): data_type = "data_extraction" - prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_20240919120502.xlsx" - # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/509350496.xlsx" + prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx" + # prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240920153730.xlsx" + # prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/469138353.xlsx" prediction_sheet_name = "mapping_data" ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx" ground_truth_sheet_name = "mapping_data" @@ -577,27 +578,27 @@ if __name__ == "__main__": # extract_way, # re_run_extract_data) - special_doc_id_list = ["505174428", "510326848", "349679479"] - # special_doc_id_list = ["505174428"] + # special_doc_id_list = ["505174428", "510326848", "349679479"] + special_doc_id_list = [] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_mapping_data = True force_save_total_data = False extract_ways = ["text"] - for extract_way in extract_ways: - batch_start_job( - pdf_folder, - page_filter_ground_truth_file, - output_extract_data_child_folder, - output_mapping_child_folder, - output_extract_data_total_folder, - output_mapping_total_folder, - extract_way, - special_doc_id_list, - re_run_extract_data, - re_run_mapping_data, - force_save_total_data=force_save_total_data, - ) + # for extract_way in extract_ways: + # batch_start_job( + # pdf_folder, + # page_filter_ground_truth_file, + # output_extract_data_child_folder, + # output_mapping_child_folder, + # output_extract_data_total_folder, + # output_mapping_total_folder, + # extract_way, + # special_doc_id_list, + # re_run_extract_data, + # re_run_mapping_data, + # force_save_total_data=force_save_total_data, + # ) - # test_data_extraction_metrics() + test_data_extraction_metrics() diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 571f265..9b3a782 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -269,6 +269,8 @@ def replace_abbrevation(text: str): text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE) elif '€' in text.lower().split(): text = re.sub(r'\€', 'EUR', text, flags=re.IGNORECASE) + elif 'RMB' in text.lower().split(): + text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE) else: pass