support more performance fee keywords
This commit is contained in:
parent
65e752e25a
commit
9348e32caa
|
|
@ -398,6 +398,8 @@
|
||||||
"Performancegebühren",
|
"Performancegebühren",
|
||||||
"Performancevergütung",
|
"Performancevergütung",
|
||||||
"Anlageerfolgsprämie",
|
"Anlageerfolgsprämie",
|
||||||
|
"Anlageerfolgs-prämie",
|
||||||
|
"Anlageerfolgs- prämie",
|
||||||
"TER in % (inkl.",
|
"TER in % (inkl.",
|
||||||
"TER % (inkl.",
|
"TER % (inkl.",
|
||||||
"TER in % (exkl.",
|
"TER in % (exkl.",
|
||||||
|
|
|
||||||
|
|
@ -223,6 +223,14 @@
|
||||||
"The output should be:",
|
"The output should be:",
|
||||||
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
|
"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
|
||||||
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
|
"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
|
||||||
|
"Example 4:",
|
||||||
|
"-----Example Start-----",
|
||||||
|
"Fonds \nTER % \n(einschließlich \nAnlageerfolgs- \nprämie) \nTER % \n(ohne \nAnlageerfolgs-\nprämie) \ndb x-trackers EUR Liquid Corporate 12.5 UCITS ETF \n \n \nKlasse 1C \n0,35 % \n0,35 %",
|
||||||
|
"-----Example End-----",
|
||||||
|
"The output should be:",
|
||||||
|
"{\"data\": [{\"fund name\": \"db x-trackers EUR Liquid Corporate 12.5 UCITS ETF\", \"share name\": \"Klasse 1C\", \"ter\": 0.35, \"performance_fee\": 0}]}",
|
||||||
|
"The performance fees value is TER % (einschließlich Anlageerfolgsprämie) - TER % (ohne Anlageerfolgsprämie) = 0,35 - 0,35 = 0",
|
||||||
|
"or TER % (einschließlich Anlageerfolgs- \nprämie) - TER % (ohne Anlageerfolgs- \nprämie) = 0,35 - 0,35 = 0",
|
||||||
"Case 2:",
|
"Case 2:",
|
||||||
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
|
"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
|
||||||
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
|
"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ",
|
||||||
|
|
|
||||||
11
main.py
11
main.py
|
|
@ -869,7 +869,7 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
|
||||||
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
|
new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)
|
||||||
|
|
||||||
|
|
||||||
def batch_run_documents():
|
def batch_run_documents(special_doc_id_list: list = None):
|
||||||
sample_document_list_folder = r'./sample_documents/'
|
sample_document_list_folder = r'./sample_documents/'
|
||||||
document_list_files = glob(sample_document_list_folder + "*.txt")
|
document_list_files = glob(sample_document_list_folder + "*.txt")
|
||||||
|
|
||||||
|
|
@ -887,8 +887,8 @@ def batch_run_documents():
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_way = "text"
|
extract_way = "text"
|
||||||
special_doc_id_list = []
|
# special_doc_id_list = []
|
||||||
if len(special_doc_id_list) == 0:
|
if special_doc_id_list is None or len(special_doc_id_list) == 0:
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
# file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
|
# file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
|
||||||
file_base_name_candidates = ["sample_documents_12_11"]
|
file_base_name_candidates = ["sample_documents_12_11"]
|
||||||
|
|
@ -1031,9 +1031,10 @@ if __name__ == "__main__":
|
||||||
data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
|
||||||
document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
|
||||||
output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
|
output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
|
||||||
merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
|
# merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
|
||||||
# batch_initial_document()
|
# batch_initial_document()
|
||||||
# batch_run_documents()
|
special_doc_id_list = ["553242411"]
|
||||||
|
batch_run_documents(special_doc_id_list)
|
||||||
|
|
||||||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||||
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
# original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
|
||||||
|
|
|
||||||
|
|
@ -67,7 +67,7 @@ def add_slash_to_text_as_regex(text: str):
|
||||||
continue
|
continue
|
||||||
replace = r"\{0}".format(special_iter.group())
|
replace = r"\{0}".format(special_iter.group())
|
||||||
if replace not in text:
|
if replace not in text:
|
||||||
text = re.sub(replace, r"\\W", text)
|
text = re.sub(replace, r"\\W*", text)
|
||||||
text = re.sub(r"( ){2,}", " ", text)
|
text = re.sub(r"( ){2,}", " ", text)
|
||||||
text = text.replace(" ", r"\s*")
|
text = text.replace(" ", r"\s*")
|
||||||
return text
|
return text
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue