From d79b05885d7a5f71983b8f24dd414bd5795e91b8 Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 6 Dec 2024 14:50:34 -0600 Subject: [PATCH] optimize prompts for TOR --- configuration/datapoint_reported_name.json | 2 +- instructions/data_extraction_prompts_config.json | 3 ++- main.py | 9 ++++----- specific_calc_metrics.py | 8 +++++--- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json index 9e115b1..e1b0342 100644 --- a/configuration/datapoint_reported_name.json +++ b/configuration/datapoint_reported_name.json @@ -176,7 +176,7 @@ "Rotatie van de portefeuille", "POF", "Portefeuille omloop factor", - "Taux de rotation corrigé - Gecorrigeerde omloopsnelheid" + "Taux de rotation - Omloopsnelheid" ], "french": [ "taux de rotation", diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index b341774..c2a676a 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -70,7 +70,8 @@ "special_rule": { "tor": [ "If there are multiple TOR reported names, here is the priority rules:", - "- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"." + "- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation - Omloopsnelheid\".", + "- With \"Omloopsnelheid\" and \"Gecorrigeerde omloopsnelheid\", pick up the values from \"Omloopsnelheid\"." ], "ter": [ "If there are multiple TER value columns, here is the priority rules:", diff --git a/main.py b/main.py index 3e2e095..d965cfb 100644 --- a/main.py +++ b/main.py @@ -1122,7 +1122,6 @@ def batch_run_documents(): "407275419", "337937633", "337293427", - "334584772", "404712928", "451063582", "451878128", @@ -1151,7 +1150,7 @@ def batch_run_documents(): "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["422760156"] + # special_doc_id_list = ["407275419", "425595958", "451063582", "451878128"] pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" @@ -1160,9 +1159,9 @@ def batch_run_documents(): output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True - re_run_mapping_data = True - force_save_total_data = False + re_run_extract_data = False + re_run_mapping_data = False + force_save_total_data = True calculate_metrics = False extract_ways = ["text"] diff --git a/specific_calc_metrics.py b/specific_calc_metrics.py index 735fc71..620de34 100644 --- a/specific_calc_metrics.py +++ b/specific_calc_metrics.py @@ -62,9 +62,11 @@ def calculate_complex_document_metrics(verify_file_path: str, document_list: lis document_count = len(document_list) \ if document_list is not None and len(document_list) > 0 \ else len(data_df["doc_id"].unique()) - + + verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") + output_metrics_file = os.path.join(output_folder, - f"complex_document_{document_count}_metrics.xlsx") + f"complex_{verify_file_name}_metrics.xlsx") with pd.ExcelWriter(output_metrics_file) as writer: metrics_df.to_excel(writer, index=False, sheet_name="metrics") @@ -111,7 +113,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict: if __name__ == "__main__": file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/" - verify_file = "mapping_data_info_31_documents_by_text_first_round.xlsx" + verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx" verify_file_path = os.path.join(file_folder, verify_file) document_list = [ "334584772",