From e2b9bcbdbca5f1757b84c128130fbaf80a070764 Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 21 Jan 2025 17:09:45 -0600 Subject: [PATCH] initial abbreviation configurations --- core/auz_nz/hybrid_solution_script.py | 8 +++++--- main.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/core/auz_nz/hybrid_solution_script.py b/core/auz_nz/hybrid_solution_script.py index 8f316fd..1ec5c93 100644 --- a/core/auz_nz/hybrid_solution_script.py +++ b/core/auz_nz/hybrid_solution_script.py @@ -34,14 +34,16 @@ ABB_JSON = dict() def get_abb_json(): global ABB_JSON - with open("abbreviation_records.json", "r") as file: - # Load the JSON and convert keys to lowercase - ABB_JSON = {key.lower(): value for key, value in json.load(file).items()} + if len(ABB_JSON.keys()) == 0: + with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file: + # Load the JSON and convert keys to lowercase + ABB_JSON = {key.lower(): value for key, value in json.load(file).items()} def get_abbre_format_str(fundname): """Replaces abbreviations in a fund name with their expanded forms.""" # Convert fund name to lowercase while matching f_list = fundname.lower().split() + get_abb_json() updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list] return " ".join(updated_doc_fname_words) diff --git a/main.py b/main.py index b07ef3f..2052e49 100644 --- a/main.py +++ b/main.py @@ -1016,7 +1016,7 @@ def batch_run_documents( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" ) re_run_extract_data = False - re_run_mapping_data = False + re_run_mapping_data = True force_save_total_data = True calculate_metrics = False