initial abbreviation configurations

This commit is contained in:
Blade He 2025-01-21 17:09:45 -06:00
parent b15d260a58
commit e2b9bcbdbc
2 changed files with 6 additions and 4 deletions

View File

@@ -34,7 +34,8 @@ ABB_JSON = dict()
def get_abb_json(): def get_abb_json():
global ABB_JSON global ABB_JSON
with open("abbreviation_records.json", "r") as file: if len(ABB_JSON.keys()) == 0:
with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file:
# Load the JSON and convert keys to lowercase # Load the JSON and convert keys to lowercase
ABB_JSON = {key.lower(): value for key, value in json.load(file).items()} ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
@@ -42,6 +43,7 @@ def get_abbre_format_str(fundname):
"""Replaces abbreviations in a fund name with their expanded forms.""" """Replaces abbreviations in a fund name with their expanded forms."""
# Convert fund name to lowercase while matching # Convert fund name to lowercase while matching
f_list = fundname.lower().split() f_list = fundname.lower().split()
get_abb_json()
updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list] updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
return " ".join(updated_doc_fname_words) return " ".join(updated_doc_fname_words)

View File

@@ -1016,7 +1016,7 @@ def batch_run_documents(
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = False re_run_extract_data = False
re_run_mapping_data = False re_run_mapping_data = True
force_save_total_data = True force_save_total_data = True
calculate_metrics = False calculate_metrics = False