support more performance fee keywords

2025-01-06 13:14:20 -06:00 · 2025-01-06 13:14:20 -06:00 · 9348e32caa
parent 65e752e25a
commit 9348e32caa
4 changed files with 17 additions and 6 deletions
--- a/configuration/datapoint_keyword.json
+++ b/configuration/datapoint_keyword.json
@ -398,6 +398,8 @@
      "Performancegebühren",
      "Performancevergütung",
      "Anlageerfolgsprämie",
+      "Anlageerfolgs-prämie",
+      "Anlageerfolgs- prämie",
      "TER in % (inkl.",
      "TER % (inkl.",
      "TER in % (exkl.",
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@ -223,6 +223,14 @@
 					"The output should be:",
 					"{\"data\": [{\"fund name\": \"PIANO 400 Fund\", \"ter\": 0.58, \"performance_fee\": 0}]}",
 					"The performance fees value is TER % (inkl. Anlageerfolgsprämie) - TER % (exkl. Anlageerfolgsprämie) = 0,58 - 0,58 = 0",
+					"Example 4:",
+					"-----Example Start-----",
+					"Fonds \nTER % \n(einschließlich \nAnlageerfolgs- \nprämie) \nTER % \n(ohne \nAnlageerfolgs-\nprämie) \ndb x-trackers EUR Liquid Corporate 12.5 UCITS ETF \n \n \nKlasse 1C \n0,35 % \n0,35 %",
+					"-----Example End-----",
+					"The output should be:",
+					"{\"data\": [{\"fund name\": \"db x-trackers EUR Liquid Corporate 12.5 UCITS ETF\", \"share name\": \"Klasse 1C\", \"ter\": 0.35, \"performance_fee\": 0}]}",
+					"The performance fees value is TER % (einschließlich Anlageerfolgsprämie) - TER % (ohne Anlageerfolgsprämie) = 0,35 - 0,35 = 0",
+					"or TER % (einschließlich Anlageerfolgs- \nprämie) - TER % (ohne Anlageerfolgs- \nprämie) = 0,35 - 0,35 = 0",
 					"Case 2:",
 					"If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ",
 					"The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", 
--- a/main.py
+++ b/main.py
@ -869,7 +869,7 @@ def replace_rerun_data(new_data_file: str, original_data_file: str):
        new_extract_data.to_excel(writer, index=False, sheet_name=extract_data_sheet)


-def batch_run_documents():
+def batch_run_documents(special_doc_id_list: list = None):
    sample_document_list_folder = r'./sample_documents/'
    document_list_files = glob(sample_document_list_folder + "*.txt")
    
@ -887,8 +887,8 @@ def batch_run_documents():
    calculate_metrics = False

    extract_way = "text"
-    special_doc_id_list = []
-    if len(special_doc_id_list) == 0:
+    # special_doc_id_list = []
+    if special_doc_id_list is None or len(special_doc_id_list) == 0:
        force_save_total_data = True
        # file_base_name_candidates = ["sample_document_complex", "emea_case_from_word_complex"]
        file_base_name_candidates = ["sample_documents_12_11"]
@ -1031,9 +1031,10 @@ if __name__ == "__main__":
    data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_mapping_data_info_44_documents_by_text_20241211185546.xlsx"
    document_mapping_file_path = r"/data/emea_ar/basic_information/sample_documents/sample_doc/sample_documents_12_11/doc_mapping_12_11.xlsx"
    output_data_file_path = r"/data/emea_ar/ground_truth/data_extraction/verify/20241211/sample_documents_12_11_merged_data_info.xlsx"
-    merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
+    # merge_output_data(data_file_path, document_mapping_file_path, output_data_file_path)
    # batch_initial_document()
-    # batch_run_documents()
+    special_doc_id_list = ["553242411"]
+    batch_run_documents(special_doc_id_list)
    
    # new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
    # original_data_file = r"/data/emea_ar/ground_truth/data_extraction/verify/mapping_data_info_30_documents_all_4_datapoints_20241106_verify_mapping.xlsx"
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -67,7 +67,7 @@ def add_slash_to_text_as_regex(text: str):
            continue
        replace = r"\{0}".format(special_iter.group())
        if replace not in text:
-            text = re.sub(replace, r"\\W", text)
+            text = re.sub(replace, r"\\W*", text)
    text = re.sub(r"( ){2,}", " ", text)
    text = text.replace(" ", r"\s*")
    return text