diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index 50fcac3..9e934d3 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -297,6 +297,7 @@ "On Going Charges", "OC", "Ongoing Charge Figure OCF", + "OCF Cap Rate", "Ongoing Fund Charge", "Operating Charge", "Operating Charges", diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 1d40a0e..7a1e30b 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -15,7 +15,7 @@ ], "reported_name": { "tor": "The TOR reported name could be:\nTOR, Turnover Ratio, Portfolio Turnover, Portfolio turnover ratio, PTR, Taux de rotation corrigé - Gecorrigeerde omloopsnelheid, etc.", - "ogc": "The OGC reported name could be:\nOGC, OGF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, etc.", + "ogc": "The OGC reported name could be:\nOGC, OGF, OCF, Ongoing Charge, Operation Charge, Ongoing charges in per cent, Ongoing charges in percent, Ongoing charges as a percentage, On Going Charges, Operating Charge, Ongoing Fund Charge, OCF Cap Rate, Ongoing Charges Figure, etc.", "ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), Annualised TER % (with fee waiver), etc.", "performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc." }, diff --git a/main.py b/main.py index 35c7ad4..146b045 100644 --- a/main.py +++ b/main.py @@ -809,6 +809,18 @@ if __name__ == "__main__": ] # documents in New EMEA Documents sample.xlsx as typical documents + # """ + # Below 9 documents can't get data by keywords or ChatGPT + # 526747539, + # 534112077, + # 535798742, + # 536299372, + # 539566148, + # 541343431, + # 541923319, + # 543243585, + # 543243654 + # """ check_db_mapping_doc_id_list = [ "511052670", "520733219", @@ -842,12 +854,12 @@ if __name__ == "__main__": ] # special_doc_id_list = check_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["434902020", "467595142", "528826768"] + special_doc_id_list = ["536299372"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = True + re_run_extract_data = False re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"] diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 587520c..c89d600 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -792,11 +792,11 @@ def replace_abbrevation(text: str): text_splits = text.split() new_text_splits = [] for split in text_splits: - if split.lower() in ['acc', 'acc.']: + if split.lower() in ['acc', 'acc.', 'accumulating']: new_text_splits.append('Accumulation') elif split.lower() in ['inc', 'inc.']: new_text_splits.append('Income') - elif split.lower() in ['dist', 'dist.', 'dis', 'dis.']: + elif split.lower() in ['dist', 'dist.', 'dis', 'dis.', "distributing"]: new_text_splits.append('Distribution') elif split.lower() in ['inv', 'inv.']: new_text_splits.append('Investor')