From a11a99fdc3d27989e66f2a3dece3ce4f5650ccfe Mon Sep 17 00:00:00 2001 From: Blade He Date: Tue, 3 Dec 2024 11:27:28 -0600 Subject: [PATCH] 1. Optimize instructions: not to fetch the data with "up to" statement. 2. Add exception handler in function. --- configuration/datapoint_reported_name.json | 2 +- core/data_extraction.py | 5 + .../data_extraction_prompts_config.json | 21 ++ main.py | 2 +- utils/biz_utils.py | 218 +++++++++--------- 5 files changed, 139 insertions(+), 109 deletions(-) diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json index 7e1842b..9e115b1 100644 --- a/configuration/datapoint_reported_name.json +++ b/configuration/datapoint_reported_name.json @@ -35,6 +35,7 @@ "Ratio de gastos totales" ], "german": [ + "Mit anteiliger Performance Fee in %", "TER inkl. Performance-Fee in % **)", "Gesamtgebühren", "Kostenpauschale", @@ -45,7 +46,6 @@ "kostenquote", "Gesamt kostenquote", "Betriebskostenquote des Fonds", - "Pauschalgebühr", "Total Expense Ratio in Prozent", "Annualisierte TER in % (Mit Gebührenverzicht)" ], diff --git a/core/data_extraction.py b/core/data_extraction.py index f258f1f..dfaefd3 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -813,8 +813,11 @@ class DataExtraction: instructions.append("Special cases:\n") special_cases = self.instructions_config.get("special_cases", {}) special_cases_common_list = special_cases.get("common", []) + special_cases_number = 1 for special_cases_common in special_cases_common_list: title = special_cases_common.get("title", "") + title = f"{special_cases_number}. {title} " + special_cases_number += 1 instructions.append(title) instructions.append("\n") contents_list = special_cases_common.get("contents", []) @@ -826,6 +829,8 @@ class DataExtraction: special_case_list = special_cases.get(datapoint, []) for special_case in special_case_list: title = special_case.get("title", "") + title = f"{special_cases_number}. {title} " + special_cases_number += 1 instructions.append(title) instructions.append("\n") contents_list = special_case.get("contents", []) diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 6594dde..cc6b238 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -70,6 +70,7 @@ "- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".", "- With \"TER inkl. Performance-Fee in % **)\" and \"TER exkl. Performance-Fee in % **)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".", "- With \"TER inkl. Performance-Fee in % **)\" and \"TER inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".", + "- With \"Mit anteiliger Performance Fee in %\" and \"Ohne anteilige Performance-Fee in %\", pick up the values from \"Mit anteiliger Performance Fee in %\".", "- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".", "- With both of \"Net TER\" and \"Capped Expense Ratio\", the priority is \"Net TER\", please exclude the column: \"Capped Expense Ratio\", only pick up the values from \"Net TER\".", "- With \"Gross TER\", \"Waiver\", \"Net TER\", \"Capped Expense Ratio\" as column titles, pick up the values from \"Net TER\".", @@ -132,6 +133,26 @@ "The output should be:", "{\"data\": []}" ] + }, + { + "title": "Don't fetch data with \"up to\" statement", + "contents":[ + "If the value is with \"up to\" statement, please ignore the value.", + "Example 1:", + "-----Example Start-----", + "A-Class\nB-Class\nC-Class\n", + "TER\nUp to 1.00%\nUp to 1.20%\nUp to 1.50%\n", + "-----Example End-----", + "The output should be:", + "{\"data\": []}", + "Example 2:", + "-----Example Start-----", + "A-Aktien\nB-Aktien\nC-Aktien\n", + "TER\nbis zu 1,20 % p.a.\nbis zu 2,20 % p.a.\nbis zu 1,00 % p.a.\n", + "-----Example End-----", + "The output should be:", + "{\"data\": []}" + ] } ], "ter": [ diff --git a/main.py b/main.py index 14adec9..5c58107 100644 --- a/main.py +++ b/main.py @@ -1197,7 +1197,7 @@ if __name__ == "__main__": "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - special_doc_id_list = ["451878128"] + special_doc_id_list = ["532998065"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True diff --git a/utils/biz_utils.py b/utils/biz_utils.py index fb48edc..38688b2 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -377,7 +377,8 @@ def replace_share_name_for_multilingual(text: str, share_name: str): multilingual_share_list = ["Catégorie de parts", "Classe di quote", "Kategorie Anteile", "Kategorie anteile", "Clase de participaciones", "Aandelenklasse", - "aandelenklasse", "Anteilklasse", "anteilklasse"] + "aandelenklasse", "Anteilklasse", "anteilklasse", + "Aktien", "Aktienklasse", "aktien", "aktienklasse"] for multilingual_share in multilingual_share_list: if multilingual_share in text: text = text.replace(multilingual_share, "Class") @@ -531,113 +532,116 @@ def get_currency_from_text(text: str): def update_for_currency(text: str, share_name: str, compare_list: list): - currency_in_text = get_currency_from_text(text) - with_currency = False - if currency_in_text is not None: - with_currency = True - - with_currency_list = [] - without_currency_list = [] - for index, compare in enumerate(compare_list): - # compare_split = compare.split() - with_currency_compare = False - currecy_in_compare = get_currency_from_text(compare) - if currecy_in_compare is not None: - with_currency_compare = True - - if with_currency_compare: - with_currency_list.append(index) - else: - without_currency_list.append(index) - if not with_currency and len(with_currency_list) == 0: - pass - elif not with_currency and len(with_currency_list) > 0: - share_short_name_list = [] - if share_name is not None and len(share_name.strip()) > 0: - share_short_name_list = get_share_short_name_from_text(share_name) - updated = False - if len(share_short_name_list) > 0: - if len(without_currency_list) > 0: - for index in without_currency_list: - all_in_list = True - compare_split = [split.upper() for split in compare_list[index].split()] - for share_shot_name in share_short_name_list: - if share_shot_name not in compare_split: - all_in_list = False - break - if all_in_list: - text = text + ' ' + 'USD' - if share_name is not None: - share_name = share_name + ' ' + 'USD' - updated = True - break - if not updated: - currency_list = [] - for index in with_currency_list: - all_in_list = True - compare_split = [split.upper() for split in compare_list[index].split()] - for share_shot_name in share_short_name_list: - if share_shot_name not in compare_split: - all_in_list = False - break - if all_in_list: - current_currency_list = [split for split in compare_split - if split.upper() in total_currency_list] - if len(current_currency_list) > 0: - currency_list.append(current_currency_list[-1]) - if len(currency_list) == 1: - text = text + ' ' + currency_list[0] - if share_name is not None: - share_name = share_name + ' ' + currency_list[0] - updated = True - - for index in without_currency_list: - compare_list[index] = compare_list[index] + ' ' + 'USD' - - if not updated: - text = text + ' ' + 'USD' - if share_name is not None: - share_name = share_name + ' ' + 'USD' - # return text, share_name, compare_list - elif with_currency and len(without_currency_list) == 0: - for index in without_currency_list: - compare_list[index] = compare_list[index] + ' ' + 'USD' - # return text, share_name, compare_list - else: - # return text, share_name, compare_list - pass - default_currency = 'USD' - if with_currency: - share_name_split = share_name.split() - share_name_currency = get_currency_from_text(share_name) - if share_name_currency is not None and share_name_currency in total_currency_list: - for split in share_name_split: - if split in total_currency_list and split != share_name_currency: - default_currency = split - break - new_share_name = ' '.join([split for split in share_name_split - if split not in total_currency_list - or (split == share_name_currency)]) - if share_name in text: - text = text.replace(share_name, new_share_name) + try: + currency_in_text = get_currency_from_text(text) + with_currency = False + if currency_in_text is not None: + with_currency = True + + with_currency_list = [] + without_currency_list = [] + for index, compare in enumerate(compare_list): + # compare_split = compare.split() + with_currency_compare = False + currecy_in_compare = get_currency_from_text(compare) + if currecy_in_compare is not None: + with_currency_compare = True + + if with_currency_compare: + with_currency_list.append(index) else: - text = ' '.join([split for split in text.split() - if split not in total_currency_list - or (split == share_name_currency)]) - share_name = new_share_name - - for c_i in range(len(compare_list)): - compare = compare_list[c_i] - compare_share_part = get_share_part_list([compare])[0] - compare_share_part_split = compare_share_part.split() - compare_share_part_currency_list = [] - for split in compare_share_part_split: - if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list: - compare_share_part_currency_list.append(split) - if len(compare_share_part_currency_list) > 1 and default_currency in compare_share_part_currency_list: - compare_share_part_split = [split for split in compare_share_part_split if split.upper() != default_currency] - new_compare_share_part = ' '.join(compare_share_part_split) - compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part) + without_currency_list.append(index) + if not with_currency and len(with_currency_list) == 0: + pass + elif not with_currency and len(with_currency_list) > 0: + share_short_name_list = [] + if share_name is not None and len(share_name.strip()) > 0: + share_short_name_list = get_share_short_name_from_text(share_name) + updated = False + if len(share_short_name_list) > 0: + if len(without_currency_list) > 0: + for index in without_currency_list: + all_in_list = True + compare_split = [split.upper() for split in compare_list[index].split()] + for share_shot_name in share_short_name_list: + if share_shot_name not in compare_split: + all_in_list = False + break + if all_in_list: + text = text + ' ' + 'USD' + if share_name is not None: + share_name = share_name + ' ' + 'USD' + updated = True + break + if not updated: + currency_list = [] + for index in with_currency_list: + all_in_list = True + compare_split = [split.upper() for split in compare_list[index].split()] + for share_shot_name in share_short_name_list: + if share_shot_name not in compare_split: + all_in_list = False + break + if all_in_list: + current_currency_list = [split for split in compare_split + if split.upper() in total_currency_list] + if len(current_currency_list) > 0: + currency_list.append(current_currency_list[-1]) + if len(currency_list) == 1: + text = text + ' ' + currency_list[0] + if share_name is not None: + share_name = share_name + ' ' + currency_list[0] + updated = True + + for index in without_currency_list: + compare_list[index] = compare_list[index] + ' ' + 'USD' + + if not updated: + text = text + ' ' + 'USD' + if share_name is not None: + share_name = share_name + ' ' + 'USD' + # return text, share_name, compare_list + elif with_currency and len(without_currency_list) == 0: + for index in without_currency_list: + compare_list[index] = compare_list[index] + ' ' + 'USD' + # return text, share_name, compare_list + else: + # return text, share_name, compare_list + pass + default_currency = 'USD' + if with_currency and share_name is not None: + share_name_split = share_name.split() + share_name_currency = get_currency_from_text(share_name) + if share_name_currency is not None and share_name_currency in total_currency_list: + for split in share_name_split: + if split in total_currency_list and split != share_name_currency: + default_currency = split + break + new_share_name = ' '.join([split for split in share_name_split + if split not in total_currency_list + or (split == share_name_currency)]) + if share_name in text: + text = text.replace(share_name, new_share_name) + else: + text = ' '.join([split for split in text.split() + if split not in total_currency_list + or (split == share_name_currency)]) + share_name = new_share_name + + for c_i in range(len(compare_list)): + compare = compare_list[c_i] + compare_share_part = get_share_part_list([compare])[0] + compare_share_part_split = compare_share_part.split() + compare_share_part_currency_list = [] + for split in compare_share_part_split: + if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list: + compare_share_part_currency_list.append(split) + if len(compare_share_part_currency_list) > 1 and default_currency in compare_share_part_currency_list: + compare_share_part_split = [split for split in compare_share_part_split if split.upper() != default_currency] + new_compare_share_part = ' '.join(compare_share_part_split) + compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part) + except Exception as e: + logger.error(f"Error in update_for_currency: {e}") return text, share_name, compare_list