initial abbreviation configurations

This commit is contained in:
Blade He 2025-01-21 17:09:45 -06:00
parent b15d260a58
commit e2b9bcbdbc
2 changed files with 6 additions and 4 deletions

View File

@@ -34,14 +34,16 @@ ABB_JSON = dict()
# Fill the module-level ABB_JSON mapping from an abbreviation JSON file,
# storing every key in lowercase.
# NOTE(review): this span comes from a diff hunk whose +/- markers and
# indentation were lost. The three lines that open "abbreviation_records.json"
# look like the removed version, and the `len(...) == 0` guard plus the
# "./configuration/aus_prospectus/..." open look like the added version --
# confirm against the repository before relying on either reading.
def get_abb_json():
global ABB_JSON
# Relative path: resolved against the process's current working directory.
with open("abbreviation_records.json", "r") as file:
# Load the JSON and convert keys to lowercase
ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
# Presumably guards the open() below so the file is only read while ABB_JSON
# is still empty (nesting cannot be confirmed from this view).
if len(ABB_JSON.keys()) == 0:
# Relative path: resolved against the process's current working directory.
with open("./configuration/aus_prospectus/abbreviation_records.json", "r") as file:
# Load the JSON and convert keys to lowercase
ABB_JSON = {key.lower(): value for key, value in json.load(file).items()}
def get_abbre_format_str(fundname):
    """Return *fundname* lowercased, with known words swapped via ABB_JSON.

    The name is lowercased and split on whitespace. Each word that is a key
    of ABB_JSON is replaced by its mapped value (itself lowercased); any
    other word passes through unchanged. The words are re-joined with single
    spaces.
    """
    tokens = fundname.lower().split()
    # Load the abbreviation table before looking any word up.
    get_abb_json()
    expanded = []
    for token in tokens:
        replacement = ABB_JSON.get(token, token)
        expanded.append(replacement.lower())
    return " ".join(expanded)

View File

@@ -1016,7 +1016,7 @@ def batch_run_documents(
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
)
re_run_extract_data = False
re_run_mapping_data = False
re_run_mapping_data = True
force_save_total_data = True
calculate_metrics = False