optimize prompts for TOR
This commit is contained in:
parent
a25991e2bb
commit
d79b05885d
|
|
@ -176,7 +176,7 @@
|
||||||
"Rotatie van de portefeuille",
|
"Rotatie van de portefeuille",
|
||||||
"POF",
|
"POF",
|
||||||
"Portefeuille omloop factor",
|
"Portefeuille omloop factor",
|
||||||
"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid"
|
"Taux de rotation - Omloopsnelheid"
|
||||||
],
|
],
|
||||||
"french": [
|
"french": [
|
||||||
"taux de rotation",
|
"taux de rotation",
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,8 @@
|
||||||
"special_rule": {
|
"special_rule": {
|
||||||
"tor": [
|
"tor": [
|
||||||
"If there are multiple TOR reported names, here is the priority rules:",
|
"If there are multiple TOR reported names, here is the priority rules:",
|
||||||
"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"."
|
"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation - Omloopsnelheid\".",
|
||||||
|
"- With \"Omloopsnelheid\" and \"Gecorrigeerde omloopsnelheid\", pick up the values from \"Omloopsnelheid\"."
|
||||||
],
|
],
|
||||||
"ter": [
|
"ter": [
|
||||||
"If there are multiple TER value columns, here is the priority rules:",
|
"If there are multiple TER value columns, here is the priority rules:",
|
||||||
|
|
|
||||||
9
main.py
9
main.py
|
|
@ -1122,7 +1122,6 @@ def batch_run_documents():
|
||||||
"407275419",
|
"407275419",
|
||||||
"337937633",
|
"337937633",
|
||||||
"337293427",
|
"337293427",
|
||||||
"334584772",
|
|
||||||
"404712928",
|
"404712928",
|
||||||
"451063582",
|
"451063582",
|
||||||
"451878128",
|
"451878128",
|
||||||
|
|
@ -1151,7 +1150,7 @@ def batch_run_documents():
|
||||||
"534535767"
|
"534535767"
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
special_doc_id_list = ["422760156"]
|
# special_doc_id_list = ["407275419", "425595958", "451063582", "451878128"]
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
|
|
@ -1160,9 +1159,9 @@ def batch_run_documents():
|
||||||
output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
|
output_extract_data_total_folder = r"/data/emea_ar/output/extract_data/total/"
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = False
|
||||||
force_save_total_data = False
|
force_save_total_data = True
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -62,9 +62,11 @@ def calculate_complex_document_metrics(verify_file_path: str, document_list: lis
|
||||||
document_count = len(document_list) \
|
document_count = len(document_list) \
|
||||||
if document_list is not None and len(document_list) > 0 \
|
if document_list is not None and len(document_list) > 0 \
|
||||||
else len(data_df["doc_id"].unique())
|
else len(data_df["doc_id"].unique())
|
||||||
|
|
||||||
|
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
|
||||||
|
|
||||||
output_metrics_file = os.path.join(output_folder,
|
output_metrics_file = os.path.join(output_folder,
|
||||||
f"complex_document_{document_count}_metrics.xlsx")
|
f"complex_{verify_file_name}_metrics.xlsx")
|
||||||
with pd.ExcelWriter(output_metrics_file) as writer:
|
with pd.ExcelWriter(output_metrics_file) as writer:
|
||||||
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
|
metrics_df.to_excel(writer, index=False, sheet_name="metrics")
|
||||||
|
|
||||||
|
|
@ -111,7 +113,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
|
file_folder = r"/data/emea_ar/ground_truth/data_extraction/verify/complex/"
|
||||||
verify_file = "mapping_data_info_31_documents_by_text_first_round.xlsx"
|
verify_file = "mapping_data_info_31_documents_by_text_second_round.xlsx"
|
||||||
verify_file_path = os.path.join(file_folder, verify_file)
|
verify_file_path = os.path.join(file_folder, verify_file)
|
||||||
document_list = [
|
document_list = [
|
||||||
"334584772",
|
"334584772",
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue