diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 2a6f5c6..7ad6713 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -35,6 +35,9 @@ "- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".", "- With both of \"Net TER (including reimbursement)\" and \"Capped Expense Ratio\", the priority is \"Capped Expense Ratio\", please exclude the column: \"Net TER (including reimbursement)\", only pick up the values from \"Capped Expense Ratio\".", "Please ignore TER values which with the exception of performance fees or excluded performance fees." + ], + "performance_fee": [ + "The performance fees should not be the presence of the rates at which the performance fees are calculated." ] } }, diff --git a/main.py b/main.py index 42e5739..016e570 100644 --- a/main.py +++ b/main.py @@ -505,7 +505,7 @@ if __name__ == "__main__": # doc_id = "476492237" # extract_data(doc_id, pdf_folder, output_extract_data_child_folder, re_run) - special_doc_id_list = ["458291624"] + special_doc_id_list = ["491593469"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_mapping_data = True diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 21cbfde..05d7dd5 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -240,31 +240,31 @@ def replace_abbrevation(text: str): text_splits = text.split() new_text_splits = [] for split in text_splits: - if split.lower() in ['acc']: + if split.lower() in ['acc', 'acc.']: new_text_splits.append('Accumulation') - elif split.lower() in ['inc']: + elif split.lower() in ['inc', 'inc.']: new_text_splits.append('Income') - elif split.lower() in ['dist']: + elif split.lower() in ['dist', 'dist.']: new_text_splits.append('Distribution') - elif split.lower() in ['inv']: + elif split.lower() in ['inv', 'inv.']: new_text_splits.append('Investor') - elif split.lower() in ['inst', 'inst', 'institution']: + elif split.lower() in ['inst', 'inst.', 'institution']: new_text_splits.append('Institutional') - elif split.lower() in ['cap']: + elif split.lower() in ['cap', 'cap.']: new_text_splits.append('Capitalisation') - elif split.lower() in ['adm']: + elif split.lower() in ['adm', 'adm.']: new_text_splits.append('Admin') - elif split.lower() in ['adv']: + elif split.lower() in ['adv', 'adv.']: new_text_splits.append('Advantage') - elif split.lower() in ['hdg', 'hgd', '(h)']: + elif split.lower() in ['hdg', 'hgd', 'hdg.', 'hgd.', '(h)']: new_text_splits.append('Hedged') - elif split.lower() in ['cl']: + elif split.lower() in ['cl', 'cl.']: new_text_splits.append('Class') - elif split.lower() in ['ser']: + elif split.lower() in ['ser', 'ser.']: new_text_splits.append('Series') elif split.lower() in ['u.s.']: new_text_splits.append('US') - elif split.lower() in ['nc']: + elif split.lower() in ['nc', 'nc.']: new_text_splits.append('no trail') else: new_text_splits.append(split)