initial abbreviation configurations

2025-01-21 17:09:45 -06:00 · 2025-01-21 17:09:45 -06:00 · e2b9bcbdbc
parent b15d260a58
commit e2b9bcbdbc
2 changed files with 6 additions and 4 deletions
--- a/core/auz_nz/hybrid_solution_script.py
+++ b/core/auz_nz/hybrid_solution_script.py
@@ -34,14 +34,16 @@ ABB_JSON = dict()

 def get_abb_json():
    global ABB_JSON
-    with open("abbreviation_records.json", "r") as file:
-        # Load the JSON and convert keys to lowercase
-        ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
+    if len(ABB_JSON.keys()) == 0:
+        with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file:
+            # Load the JSON and convert keys to lowercase
+            ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}

 def get_abbre_format_str(fundname):
    """Replaces abbreviations in a fund name with their expanded forms."""
    # Convert fund name to lowercase while matching
    f_list = fundname.lower().split()
+    get_abb_json()
    updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
    return " ".join(updated_doc_fname_words)

--- a/main.py
+++ b/main.py
@@ -1016,7 +1016,7 @@ def batch_run_documents(
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
    )
    re_run_extract_data = False
-    re_run_mapping_data = False
+    re_run_mapping_data = True
    force_save_total_data = True
    calculate_metrics = False