support more performance fee keywords

This commit is contained in:
Blade He 2025-01-06 13:14:20 -06:00
parent 65e752e25a
commit 9348e32caa
4 changed files with 17 additions and 6 deletions

View File

@ -398,6 +398,8 @@
"Performancegebühren", "Performancegebühren",
"Performancevergütung", "Performancevergütung",
"Anlageerfolgsprämie", "Anlageerfolgsprämie",
"Anlageerfolgs-prämie",
"Anlageerfolgs- prämie",
"TER in % (inkl.", "TER in % (inkl.",
"TER % (inkl.", "TER % (inkl.",
"TER in % (exkl.", "TER in % (exkl.",

View File

@ -223,6 +223,14 @@
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}", "{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0", "The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
"Example 4:",
"-----Example Start-----",
"Fonds \nTER % \n(einschließlich \nAnlageerfolgs- \nprämie) \nTER % \n(ohne \nAnlageerfolgs-\nprämie) \ndb x-trackers EUR Liquid Corporate 12.5 UCITS ETF \n \n \nKlasse 1C \n0,35 % \n0,35 %",
"-----Example End-----",
"The output should be:",
"{\"data\": [{\"fund name\": \"db x-trackers EUR Liquid Corporate 12.5 UCITS ETF\", \"share name\": \"Klasse 1C\", \"ter\": 0.35, \"performance_fee\": 0}]}",
"The performance fees value is TER % (einschließlich Anlageerfolgsprämie) - TER % (ohne Anlageerfolgsprämie) = 0,35 - 0,35 = 0",
"or TER % (einschließlich Anlageerfolgs- \nprämie) - TER % (ohne Anlageerfolgs- \nprämie) = 0,35 - 0,35 = 0",
"Case 2:", "Case 2:",
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ", "If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", "The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",

11
main.py
View File

@ -869,7 +869,7 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet) new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
def batch_run_documents(): def batch_run_documents(special_doc_id_list: list = None):
sample_document_list_folder = r'./sample_documents/' sample_document_list_folder = r'./sample_documents/'
document_list_files = glob(sample_document_list_folder + "*.txt") document_list_files = glob(sample_document_list_folder + "*.txt")
@ -887,8 +887,8 @@ def batch_run_documents():
calculate_metrics = False calculate_metrics = False
extract_way = "text" extract_way = "text"
special_doc_id_list = [] # special_doc_id_list = []
if len(special_doc_id_list) == 0: if special_doc_id_list is None or len(special_doc_id_list) == 0:
force_save_total_data = True force_save_total_data = True
# file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"] # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
file_base_name_candidates = ["sample_documents_12_11"] file_base_name_candidates = ["sample_documents_12_11"]
@ -1031,9 +1031,10 @@ if __name__ == "__main__":
data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx" data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx" output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path) # merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
# batch_initial_document() # batch_initial_document()
# batch_run_documents() special_doc_id_list = ["553242411"]
batch_run_documents(special_doc_id_list)
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"

View File

@ -67,7 +67,7 @@ def add_slash_to_text_as_regex(text: str):
continue continue
replace = r"\{0}".format(special_iter.group()) replace = r"\{0}".format(special_iter.group())
if replace not in text: if replace not in text:
text = re.sub(replace, r"\\W", text) text = re.sub(replace, r"\\W*", text)
text = re.sub(r"( ){2,}", " ", text) text = re.sub(r"( ){2,}", " ", text)
text = text.replace(" ", r"\s*") text = text.replace(" ", r"\s*")
return text return text