diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index b765dc9..8da1851 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -398,6 +398,8 @@ "Performancegebühren", "Performancevergütung", "Anlageerfolgsprämie", + "Anlageerfolgs-prämie", + "Anlageerfolgs- prämie", "TER in % (inkl.", "TER % (inkl.", "TER in % (exkl.", diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index b7d37fb..41f4da6 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -223,6 +223,14 @@ "The output should be:", "{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}", "The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0", + "Example 4:", + "-----Example Start-----", + "Fonds \nTER % \n(einschließlich \nAnlageerfolgs- \nprämie) \nTER % \n(ohne \nAnlageerfolgs-\nprämie) \ndb x-trackers EUR Liquid Corporate 12.5 UCITS ETF \n \n \nKlasse 1C \n0,35 % \n0,35 %", + "-----Example End-----", + "The output should be:", + "{\"data\": [{\"fund name\": \"db x-trackers EUR Liquid Corporate 12.5 UCITS ETF\", \"share name\": \"Klasse 1C\", \"ter\": 0.35, \"performance_fee\": 0}]}", + "The performance fees value is TER % (einschließlich Anlageerfolgsprämie) - TER % (ohne Anlageerfolgsprämie) = 0,35 - 0,35 = 0", + "or TER % (einschließlich Anlageerfolgs- \nprämie) - TER % (ohne Anlageerfolgs- \nprämie) = 0,35 - 0,35 = 0", "Case 2:", "If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ", "The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", diff --git a/main.py b/main.py index f7867e1..2e5b910 100644 --- a/main.py +++ b/main.py @@ -869,7 +869,7 @@ def replace_rerun_data(new_data_file: str, original_data_file: str): new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet) -def batch_run_documents(): +def batch_run_documents(special_doc_id_list: list = None): sample_document_list_folder = r'./sample_documents/' document_list_files = glob(sample_document_list_folder + "*.txt") @@ -887,8 +887,8 @@ def batch_run_documents(): calculate_metrics = False extract_way = "text" - special_doc_id_list = [] - if len(special_doc_id_list) == 0: + # special_doc_id_list = [] + if special_doc_id_list is None or len(special_doc_id_list) == 0: force_save_total_data = True # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"] file_base_name_candidates = ["sample_documents_12_11"] @@ -1031,9 +1031,10 @@ if __name__ == "__main__": data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx" document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx" output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx" - merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path) + # merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path) # batch_initial_document() - # batch_run_documents() + special_doc_id_list = ["553242411"] + batch_run_documents(special_doc_id_list) # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx" # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx" diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 59248a3..ade84a3 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -67,7 +67,7 @@ def add_slash_to_text_as_regex(text: str): continue replace = r"\{0}".format(special_iter.group()) if replace not in text: - text = re.sub(replace, r"\\W", text) + text = re.sub(replace, r"\\W*", text) text = re.sub(r"( ){2,}", " ", text) text = text.replace(" ", r"\s*") return text