optimize instructions

optimize metrics algorithm
This commit is contained in:
Blade He 2024-09-20 16:46:44 -05:00
parent 91530d6089
commit 8496c7b5ed
5 changed files with 151 additions and 118 deletions

View File

@ -567,7 +567,8 @@ class DataExtraction:
if performance_fee is not None:
try:
performance_fee = float(performance_fee)
if performance_fee > 3 and performance_fee % 2.5 == 0:
if (performance_fee > 3 and performance_fee % 2.5 == 0) or \
performance_fee > 10:
data.pop("performance_fee")
except:
data.pop("performance_fee")

View File

@ -120,7 +120,9 @@ class Metrics:
prediction_doc_id_list = prediction_df["doc_id"].unique().tolist()
ground_truth_doc_id_list = ground_truth_df["doc_id"].unique().tolist()
# get intersection of doc_id_list
doc_id_list = list(set(prediction_doc_id_list) & set(ground_truth_doc_id_list))
doc_id_list = list(
set(prediction_doc_id_list) & set(ground_truth_doc_id_list)
)
# order by doc_id
doc_id_list.sort()
@ -296,14 +298,18 @@ class Metrics:
dp_prediction = prediction_data[prediction_data["datapoint"] == data_point]
dp_prediction = self.modify_data(dp_prediction)
pred_simple_raw_names = dp_prediction["simple_raw_name"].unique().tolist()
pred_simple_name_unique_words_list = dp_prediction["simple_name_unique_words"].unique().tolist()
pred_simple_name_unique_words_list = (
dp_prediction["simple_name_unique_words"].unique().tolist()
)
dp_ground_truth = ground_truth_data[
ground_truth_data["datapoint"] == data_point
]
dp_ground_truth = self.modify_data(dp_ground_truth)
gt_simple_raw_names = dp_ground_truth["simple_raw_name"].unique().tolist()
gt_simple_name_unique_words_list = dp_ground_truth["simple_name_unique_words"].unique().tolist()
gt_simple_name_unique_words_list = (
dp_ground_truth["simple_name_unique_words"].unique().tolist()
)
true_data = []
pred_data = []
@ -323,21 +329,36 @@ class Metrics:
pred_data_point_value = prediction["value"]
pred_investment_type = prediction["investment_type"]
find_raw_name_in_gt = [gt_raw_name for gt_raw_name in gt_simple_raw_names
if (gt_raw_name in pred_simple_raw_name or pred_simple_raw_name in gt_raw_name)
and gt_raw_name.endswith(pred_raw_name.split()[-1])]
if pred_simple_name_unique_words in gt_simple_name_unique_words_list or \
len(find_raw_name_in_gt) > 0:
find_raw_name_in_gt = [
gt_raw_name
for gt_raw_name in gt_simple_raw_names
if (
gt_raw_name in pred_simple_raw_name
or pred_simple_raw_name in gt_raw_name
)
and gt_raw_name.endswith(pred_simple_raw_name.split()[-1])
]
if (
pred_simple_name_unique_words in gt_simple_name_unique_words_list
or len(find_raw_name_in_gt) > 0
):
# get the ground truth data with the same unique words
if pred_simple_name_unique_words in gt_simple_name_unique_words_list:
gt_data_df = dp_ground_truth[
dp_ground_truth["simple_name_unique_words"] == pred_simple_name_unique_words
dp_ground_truth["simple_name_unique_words"]
== pred_simple_name_unique_words
]
if len(gt_data_df) > 1:
if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0:
if (
len(gt_data_df[gt_data_df["page_index"] == pred_page_index])
== 0
):
gt_data = gt_data_df.iloc[0]
else:
gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0]
gt_data = gt_data_df[
gt_data_df["page_index"] == pred_page_index
].iloc[0]
elif len(gt_data_df) == 1:
gt_data = gt_data_df.iloc[0]
else:
@ -347,10 +368,15 @@ class Metrics:
dp_ground_truth["simple_raw_name"] == find_raw_name_in_gt[0]
]
if len(gt_data_df) > 1:
if len(gt_data_df[gt_data_df["page_index"] == pred_page_index]) == 0:
if (
len(gt_data_df[gt_data_df["page_index"] == pred_page_index])
== 0
):
gt_data = gt_data_df.iloc[0]
else:
gt_data = gt_data_df[gt_data_df["page_index"] == pred_page_index].iloc[0]
gt_data = gt_data_df[
gt_data_df["page_index"] == pred_page_index
].iloc[0]
elif len(gt_data_df) == 1:
gt_data = gt_data_df.iloc[0]
else:
@ -359,8 +385,10 @@ class Metrics:
gt_data_point_value = None
else:
gt_data_point_value = gt_data["value"]
if gt_data_point_value is not None and \
pred_data_point_value == gt_data_point_value:
if (
gt_data_point_value is not None
and pred_data_point_value == gt_data_point_value
):
true_data.append(1)
pred_data.append(1)
else:
@ -379,7 +407,6 @@ class Metrics:
missing_error_data.append(error_data)
else:
# If data point is performance fees, and value is 0,
# If a TER record exists with the same raw name and the same page index,
# then it's correct
pred_value_num = None
try:
@ -387,29 +414,8 @@ class Metrics:
except:
pass
if data_point == "performance_fee" and pred_value_num == 0:
# get ter data with the same raw name from prediction_data
ter_data_df = prediction_data[
(prediction_data["datapoint"] == "ter") &
(prediction_data["simple_raw_name"] == pred_simple_raw_name) &
(prediction_data["page_index"] == pred_page_index)
]
if len(ter_data_df) > 0:
true_data.append(1)
pred_data.append(1)
else:
true_data.append(0)
pred_data.append(1)
error_data = {
"doc_id": doc_id,
"data_point": data_point,
"page_index": pred_page_index,
"pred_raw_name": pred_raw_name,
"investment_type": pred_investment_type,
"error_type": "raw name incorrect",
"error_value": pred_raw_name,
"correct_value": "",
}
missing_error_data.append(error_data)
true_data.append(1)
pred_data.append(1)
else:
true_data.append(0)
pred_data.append(1)
@ -433,56 +439,43 @@ class Metrics:
gt_data_point_value = ground_truth["value"]
gt_investment_type = ground_truth["investment_type"]
find_raw_name_in_pred = [pred_raw_name for pred_raw_name in pred_simple_raw_names
if (gt_simple_raw_name in pred_raw_name or pred_raw_name in gt_simple_raw_name)
and pred_raw_name.endswith(gt_raw_name.split()[-1])]
find_raw_name_in_pred = [
pred_raw_name
for pred_raw_name in pred_simple_raw_names
if (
gt_simple_raw_name in pred_raw_name
or pred_raw_name in gt_simple_raw_name
)
and pred_raw_name.endswith(gt_simple_raw_name.split()[-1])
]
if gt_simple_name_unique_words not in pred_simple_name_unique_words_list and \
len(find_raw_name_in_pred) == 0:
if (
gt_simple_name_unique_words not in pred_simple_name_unique_words_list
and len(find_raw_name_in_pred) == 0
):
gt_value_num = None
try:
gt_value_num = float(gt_data_point_value)
except:
pass
# If the data point is performance fees and the value is 0,
# and a TER record exists with the same raw name and the same page index,
# then it's correct
if data_point == "performance_fee" and gt_value_num == 0:
ter_data_df = ground_truth_data[
(ground_truth_data["datapoint"] == "ter") &
(ground_truth_data["simple_raw_name"] == gt_simple_raw_name) &
(ground_truth_data["page_index"] == gt_page_index)
]
if len(ter_data_df) > 0:
true_data.append(1)
pred_data.append(1)
else:
true_data.append(1)
pred_data.append(0)
error_data = {
"doc_id": doc_id,
"data_point": data_point,
"page_index": gt_page_index,
"pred_raw_name": "",
"investment_type": gt_investment_type,
"error_type": "raw name missing",
"error_value": "",
"correct_value": gt_raw_name,
}
missing_error_data.append(error_data)
true_data.append(1)
pred_data.append(1)
else:
true_data.append(1)
pred_data.append(0)
error_data = {
"doc_id": doc_id,
"data_point": data_point,
"page_index": gt_page_index,
"pred_raw_name": "",
"investment_type": gt_investment_type,
"error_type": "raw name missing",
"error_value": "",
"correct_value": gt_raw_name,
}
"doc_id": doc_id,
"data_point": data_point,
"page_index": gt_page_index,
"pred_raw_name": "",
"investment_type": gt_investment_type,
"error_type": "raw name missing",
"error_value": "",
"correct_value": gt_raw_name,
}
missing_error_data.append(error_data)
return true_data, pred_data, missing_error_data
@ -496,15 +489,40 @@ class Metrics:
raw_name_list = page_data["raw_name"].unique().tolist()
beginning_common_words = get_beginning_common_words(raw_name_list)
for raw_name in raw_name_list:
if beginning_common_words is not None and len(beginning_common_words) > 0:
simple_raw_name = raw_name.replace(beginning_common_words, "").strip()
if (
beginning_common_words is not None
and len(beginning_common_words) > 0
):
simple_raw_name = raw_name.replace(
beginning_common_words, ""
).strip()
if len(simple_raw_name) == 0:
simple_raw_name = raw_name
else:
simple_raw_name = raw_name
temp_splits = [word for word in simple_raw_name.split()
if word.lower() not in ["class", "usd"]]
if len(temp_splits) > 0:
simple_raw_name = " ".join(
word
for word in simple_raw_name.split()
if word.lower() not in ["class"]
)
simple_raw_name_splits = simple_raw_name.split()
if len(simple_raw_name_splits) > 2 and \
simple_raw_name_splits[-1] == "USD":
simple_raw_name = " ".join(simple_raw_name_splits[:-1])
# set simple_raw_name which with the same page and same raw_name
data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name),
"simple_raw_name"] = simple_raw_name
data.loc[(data["page_index"] == pagex_index) & (data["raw_name"] == raw_name),
"simple_name_unique_words"] = get_unique_words_text(simple_raw_name)
data.loc[
(data["page_index"] == pagex_index)
& (data["raw_name"] == raw_name),
"simple_raw_name",
] = simple_raw_name
data.loc[
(data["page_index"] == pagex_index)
& (data["raw_name"] == raw_name),
"simple_name_unique_words",
] = get_unique_words_text(simple_raw_name)
return data
def get_specific_metrics(self, true_data: list, pred_data: list):

View File

@ -69,24 +69,30 @@
{
"title": "Latest data with time series data:",
"contents": [
"Simple case:",
"Case 1:",
"Some data table is with multiple date columns, please extract the data from the latest date column:",
"- Get dates from column header.",
"- Only extract data from the columns which column header is as the latest date.",
"The latest date-time column usually is the first datapoint value column.",
"Here is the example:",
"-----Example Start-----",
"performance fees\\nhistorical performance fees\\nhistorical performance fees\\nFrom \\n1 July \\nFrom \\n19 July \\nFrom \\n1 January \\nFrom \\n27 April \\nFrom \\n19 July \\nFrom \\n1 January \\n2021\\nFrom \\n22 May \\n2021\\nFrom \\n16 July \\n2021\\nFrom \\n21 September \\n2021\\nto 30 June 2023\\nto 31 December 2022\\nto 31 December 2021\\nAsia Total Return Fund Class I5 (CHF Hedged) Acc\\n1.73%\\n \\n-1.32%\\n \\n \\n 2.04%\\n \\n \\n \\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Asia Total Return Fund\", \"share name\": \"Class I5 (CHF Hedged) Acc\", \"performance_fee\": 1.73}]}",
"The keywords are performance fees, the value 1.73 is the first number with the latest date-time.",
"Complex case:",
"Case 2:",
"Some table with messy text as header, please extract the data from the first 1 - 2 data value columns:",
"Example context:",
"-----Example Start-----",
"1RWHV WR WKH ILQDQFLDO VWDWHPHQWV Notes aux tats financiers\nLO Funds - 30/09/2023\n678 \n,6,1 &RGH \n6XE )XQGV \n6KDUH &ODVV \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \n)XQG 7(5 \n7(5 ZLWK \n3HUIRUPDQFH \n)HH \f \n6KDUH RI \n3HUIRUPDQFH \n)HH \nCompartiments \nClasse \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de la \nComm. de \nPerformance \nTER du Fonds \nTER avec \nComm. de \nPerformance4) \nQuote part de \nla Comm. de \nPerformance \n \f \n \f \n \f \n \f \n \f \n \f \n\b \n\b \n\b \n\b\n\b\n\b\nLU2376083999 \nTerreNeuve \nN A EUR SH X1 \n1.60 \n1.61 \n0.01 \n1.58 \n1.58 \n- \nLU1858044701 \nTerreNeuve \nN D GBP SH \n1.85 \n1.85 \n- \n1.84 \n1.86 \n- \n",
"-----Example End-----",
"Although the table has messy text as its header, the latest date columns are the first 2 value columns, they are \"TER du Fonds\" and \"TER avec \nComm. de \nPerformance4\".",
"The TER value is from TER avec \nComm. de \nPerformance4, the performance fees value is from \"TER avec \nComm. de \nPerformance4\" - \"TER du Fonds\", e.g. 1.61 - 1.60 = 0.01, 1.85 - 1.85 = 0.",
"The output should be:",
"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}"
"{\"data\": [{\"fund name\": \"TerreNeuve\", \"share name\": \"N A EUR SH X1\", \"ter\": 1.61, \"performance_fee\": 0.01}, {\"fund name\": \"TerreNeuve\", \"share name\": \"N D GBP SH\", \"ter\": 1.85, \"performance_fee\": 0}]}",
"Summary: \nIf there are several data value columns in the table, please extract the data from the latest date column(s).",
"If you are not sure which column is the latest date column, please extract the data from the first 1 - 2 data value columns."
]
}
],
@ -101,7 +107,9 @@
"- \"feeder fund share class\" and \"TER feeder\" values",
"- \"Master fund\" and \"TER Master\" values",
"Here is the example:",
"-----Example Start-----",
"Feeder fund (share class)\\nMaster fund\\nTER\\nFeeder\\nTER Master\\nTotal\\nGlobal Portfolio Solution DKK -\\nBalanced Class TI\\nDanske Invest SICAV Global Portfolio\\nSolution Balanced Class X\\n0.1475%\\n0.7025%\\n0.850%\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"Global Portfolio Solution DKK\", \"share name\": \"Balanced Class TI\", \"ter\": 0.1475}, {\"fund name\": \"Danske Invest SICAV Global Portfolio Solution DKK\", \"share name\": \"Balanced Class X\", \"ter\": 0.7025}]}"
]
@ -117,17 +125,19 @@
{
"title": "Performance fees is part of TER:",
"contents": [
"Common case:",
"Case 1:",
"If exist both of \"TER including performance fees\" and \"TER excluding performance fees\",",
"The TER should be \"TER including performance fees\".",
"The performance fees should be:",
"TER including performance fees - TER excluding performance fees.",
"Here is the example:",
"-----Example Start-----",
"GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}",
"The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0",
"Sepcial case:",
"Case 2:",
"Attention: if some table is with three value columns: TER excluding performance fees, TER including performance fees, Performance fees, ",
"The Performance fees value in column: Performance fees, could be \"-\", because of TER including performance fees - TER excluding performance fees = 0, ",
"But it's incorrect, according to this issue, please still extract performance fees from TER including performance fees - TER excluding performance fees.",
@ -146,8 +156,9 @@
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
"Example:",
"Context:",
"-----Example Start-----",
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
"-----Example End-----",
"Output:",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
"Fund level data: (\"fund name\" and \"TER\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",

39
main.py
View File

@ -522,8 +522,9 @@ def test_auto_generate_instructions():
def test_data_extraction_metrics():
data_type = "data_extraction"
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_20240919120502.xlsx"
# prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/509350496.xlsx"
prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_image_20240920033929.xlsx"
# prediction_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_88_documents_by_text_20240920153730.xlsx"
# prediction_file = r"/data/emea_ar/output/mapping_data/docs/by_text/excel/469138353.xlsx"
prediction_sheet_name = "mapping_data"
ground_truth_file = r"/data/emea_ar/ground_truth/data_extraction/mapping_data_info_73_documents.xlsx"
ground_truth_sheet_name = "mapping_data"
@ -577,27 +578,27 @@ if __name__ == "__main__":
# extract_way,
# re_run_extract_data)
special_doc_id_list = ["505174428", "510326848", "349679479"]
# special_doc_id_list = ["505174428"]
# special_doc_id_list = ["505174428", "510326848", "349679479"]
special_doc_id_list = []
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_mapping_data = True
force_save_total_data = False
extract_ways = ["text"]
for extract_way in extract_ways:
batch_start_job(
pdf_folder,
page_filter_ground_truth_file,
output_extract_data_child_folder,
output_mapping_child_folder,
output_extract_data_total_folder,
output_mapping_total_folder,
extract_way,
special_doc_id_list,
re_run_extract_data,
re_run_mapping_data,
force_save_total_data=force_save_total_data,
)
# for extract_way in extract_ways:
# batch_start_job(
# pdf_folder,
# page_filter_ground_truth_file,
# output_extract_data_child_folder,
# output_mapping_child_folder,
# output_extract_data_total_folder,
# output_mapping_total_folder,
# extract_way,
# special_doc_id_list,
# re_run_extract_data,
# re_run_mapping_data,
# force_save_total_data=force_save_total_data,
# )
# test_data_extraction_metrics()
test_data_extraction_metrics()

View File

@ -269,6 +269,8 @@ def replace_abbrevation(text: str):
text = re.sub(r'euro', 'EUR', text, flags=re.IGNORECASE)
elif '€' in text.lower().split():
text = re.sub(r'€', 'EUR', text, flags=re.IGNORECASE)
elif 'RMB' in text.lower().split():
text = re.sub(r'RMB', 'CNY', text, flags=re.IGNORECASE)
else:
pass