1. Optimize instructions: do not fetch values that are stated with an "up to" qualifier.

2. Add an exception handler to the update_for_currency function.
This commit is contained in:
Blade He 2024-12-03 11:27:28 -06:00
parent bc32860f87
commit a11a99fdc3
5 changed files with 139 additions and 109 deletions

View File

@ -35,6 +35,7 @@
"Ratio de gastos totales"
],
"german": [
"Mit anteiliger Performance Fee in %",
"TER inkl. Performance-Fee in % **)",
"Gesamtgebühren",
"Kostenpauschale",
@ -45,7 +46,6 @@
"kostenquote",
"Gesamt kostenquote",
"Betriebskostenquote des Fonds",
"Pauschalgebühr",
"Total Expense Ratio in Prozent",
"Annualisierte TER in % (Mit Gebührenverzicht)"
],

View File

@ -813,8 +813,11 @@ class DataExtraction:
instructions.append("Special cases:\n")
special_cases = self.instructions_config.get("special_cases", {})
special_cases_common_list = special_cases.get("common", [])
special_cases_number = 1
for special_cases_common in special_cases_common_list:
title = special_cases_common.get("title", "")
title = f"{special_cases_number}. {title} "
special_cases_number += 1
instructions.append(title)
instructions.append("\n")
contents_list = special_cases_common.get("contents", [])
@ -826,6 +829,8 @@ class DataExtraction:
special_case_list = special_cases.get(datapoint, [])
for special_case in special_case_list:
title = special_case.get("title", "")
title = f"{special_cases_number}. {title} "
special_cases_number += 1
instructions.append(title)
instructions.append("\n")
contents_list = special_case.get("contents", [])

View File

@ -70,6 +70,7 @@
"- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".",
"- With \"TER inkl. Performance-Fee in % **)\" and \"TER exkl. Performance-Fee in % **)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".",
"- With \"TER inkl. Performance-Fee in % **)\" and \"TER inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".",
"- With \"Mit anteiliger Performance Fee in %\" and \"Ohne anteilige Performance-Fee in %\", pick up the values from \"Mit anteiliger Performance Fee in %\".",
"- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".",
"- With both of \"Net TER\" and \"Capped Expense Ratio\", the priority is \"Net TER\", please exclude the column: \"Capped Expense Ratio\", only pick up the values from \"Net TER\".",
"- With \"Gross TER\", \"Waiver\", \"Net TER\", \"Capped Expense Ratio\" as column titles, pick up the values from \"Net TER\".",
@ -132,6 +133,26 @@
"The output should be:",
"{\"data\": []}"
]
},
{
"title": "Don't fetch data with \"up to\" statement",
"contents":[
"If the value is with \"up to\" statement, please ignore the value.",
"Example 1:",
"-----Example Start-----",
"A-Class\nB-Class\nC-Class\n",
"TER\nUp to 1.00%\nUp to 1.20%\nUp to 1.50%\n",
"-----Example End-----",
"The output should be:",
"{\"data\": []}",
"Example 2:",
"-----Example Start-----",
"A-Aktien\nB-Aktien\nC-Aktien\n",
"TER\nbis zu 1,20 % p.a.\nbis zu 2,20 % p.a.\nbis zu 1,00 % p.a.\n",
"-----Example End-----",
"The output should be:",
"{\"data\": []}"
]
}
],
"ter": [

View File

@ -1197,7 +1197,7 @@ if __name__ == "__main__":
"534535767"
]
special_doc_id_list = check_db_mapping_doc_id_list
special_doc_id_list = ["451878128"]
special_doc_id_list = ["532998065"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True

View File

@ -377,7 +377,8 @@ def replace_share_name_for_multilingual(text: str, share_name: str):
multilingual_share_list = ["Catégorie de parts", "Classe di quote",
"Kategorie Anteile", "Kategorie anteile",
"Clase de participaciones", "Aandelenklasse",
"aandelenklasse", "Anteilklasse", "anteilklasse"]
"aandelenklasse", "Anteilklasse", "anteilklasse",
"Aktien", "Aktienklasse", "aktien", "aktienklasse"]
for multilingual_share in multilingual_share_list:
if multilingual_share in text:
text = text.replace(multilingual_share, "Class")
@ -531,113 +532,116 @@ def get_currency_from_text(text: str):
def update_for_currency(text: str, share_name: str, compare_list: list):
currency_in_text = get_currency_from_text(text)
with_currency = False
if currency_in_text is not None:
with_currency = True
try:
currency_in_text = get_currency_from_text(text)
with_currency = False
if currency_in_text is not None:
with_currency = True
with_currency_list = []
without_currency_list = []
for index, compare in enumerate(compare_list):
# compare_split = compare.split()
with_currency_compare = False
currecy_in_compare = get_currency_from_text(compare)
if currecy_in_compare is not None:
with_currency_compare = True
with_currency_list = []
without_currency_list = []
for index, compare in enumerate(compare_list):
# compare_split = compare.split()
with_currency_compare = False
currecy_in_compare = get_currency_from_text(compare)
if currecy_in_compare is not None:
with_currency_compare = True
if with_currency_compare:
with_currency_list.append(index)
else:
without_currency_list.append(index)
if not with_currency and len(with_currency_list) == 0:
pass
elif not with_currency and len(with_currency_list) > 0:
share_short_name_list = []
if share_name is not None and len(share_name.strip()) > 0:
share_short_name_list = get_share_short_name_from_text(share_name)
updated = False
if len(share_short_name_list) > 0:
if len(without_currency_list) > 0:
for index in without_currency_list:
all_in_list = True
compare_split = [split.upper() for split in compare_list[index].split()]
for share_shot_name in share_short_name_list:
if share_shot_name not in compare_split:
all_in_list = False
break
if all_in_list:
text = text + ' ' + 'USD'
if share_name is not None:
share_name = share_name + ' ' + 'USD'
updated = True
break
if not updated:
currency_list = []
for index in with_currency_list:
all_in_list = True
compare_split = [split.upper() for split in compare_list[index].split()]
for share_shot_name in share_short_name_list:
if share_shot_name not in compare_split:
all_in_list = False
break
if all_in_list:
current_currency_list = [split for split in compare_split
if split.upper() in total_currency_list]
if len(current_currency_list) > 0:
currency_list.append(current_currency_list[-1])
if len(currency_list) == 1:
text = text + ' ' + currency_list[0]
if share_name is not None:
share_name = share_name + ' ' + currency_list[0]
updated = True
for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD'
if not updated:
text = text + ' ' + 'USD'
if share_name is not None:
share_name = share_name + ' ' + 'USD'
# return text, share_name, compare_list
elif with_currency and len(without_currency_list) == 0:
for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD'
# return text, share_name, compare_list
else:
# return text, share_name, compare_list
pass
default_currency = 'USD'
if with_currency:
share_name_split = share_name.split()
share_name_currency = get_currency_from_text(share_name)
if share_name_currency is not None and share_name_currency in total_currency_list:
for split in share_name_split:
if split in total_currency_list and split != share_name_currency:
default_currency = split
break
new_share_name = ' '.join([split for split in share_name_split
if split not in total_currency_list
or (split == share_name_currency)])
if share_name in text:
text = text.replace(share_name, new_share_name)
if with_currency_compare:
with_currency_list.append(index)
else:
text = ' '.join([split for split in text.split()
if split not in total_currency_list
or (split == share_name_currency)])
share_name = new_share_name
without_currency_list.append(index)
if not with_currency and len(with_currency_list) == 0:
pass
elif not with_currency and len(with_currency_list) > 0:
share_short_name_list = []
if share_name is not None and len(share_name.strip()) > 0:
share_short_name_list = get_share_short_name_from_text(share_name)
updated = False
if len(share_short_name_list) > 0:
if len(without_currency_list) > 0:
for index in without_currency_list:
all_in_list = True
compare_split = [split.upper() for split in compare_list[index].split()]
for share_shot_name in share_short_name_list:
if share_shot_name not in compare_split:
all_in_list = False
break
if all_in_list:
text = text + ' ' + 'USD'
if share_name is not None:
share_name = share_name + ' ' + 'USD'
updated = True
break
if not updated:
currency_list = []
for index in with_currency_list:
all_in_list = True
compare_split = [split.upper() for split in compare_list[index].split()]
for share_shot_name in share_short_name_list:
if share_shot_name not in compare_split:
all_in_list = False
break
if all_in_list:
current_currency_list = [split for split in compare_split
if split.upper() in total_currency_list]
if len(current_currency_list) > 0:
currency_list.append(current_currency_list[-1])
if len(currency_list) == 1:
text = text + ' ' + currency_list[0]
if share_name is not None:
share_name = share_name + ' ' + currency_list[0]
updated = True
for c_i in range(len(compare_list)):
compare = compare_list[c_i]
compare_share_part = get_share_part_list([compare])[0]
compare_share_part_split = compare_share_part.split()
compare_share_part_currency_list = []
for split in compare_share_part_split:
if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list:
compare_share_part_currency_list.append(split)
if len(compare_share_part_currency_list) > 1 and default_currency in compare_share_part_currency_list:
compare_share_part_split = [split for split in compare_share_part_split if split.upper() != default_currency]
new_compare_share_part = ' '.join(compare_share_part_split)
compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part)
for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD'
if not updated:
text = text + ' ' + 'USD'
if share_name is not None:
share_name = share_name + ' ' + 'USD'
# return text, share_name, compare_list
elif with_currency and len(without_currency_list) == 0:
for index in without_currency_list:
compare_list[index] = compare_list[index] + ' ' + 'USD'
# return text, share_name, compare_list
else:
# return text, share_name, compare_list
pass
default_currency = 'USD'
if with_currency and share_name is not None:
share_name_split = share_name.split()
share_name_currency = get_currency_from_text(share_name)
if share_name_currency is not None and share_name_currency in total_currency_list:
for split in share_name_split:
if split in total_currency_list and split != share_name_currency:
default_currency = split
break
new_share_name = ' '.join([split for split in share_name_split
if split not in total_currency_list
or (split == share_name_currency)])
if share_name in text:
text = text.replace(share_name, new_share_name)
else:
text = ' '.join([split for split in text.split()
if split not in total_currency_list
or (split == share_name_currency)])
share_name = new_share_name
for c_i in range(len(compare_list)):
compare = compare_list[c_i]
compare_share_part = get_share_part_list([compare])[0]
compare_share_part_split = compare_share_part.split()
compare_share_part_currency_list = []
for split in compare_share_part_split:
if split.upper() in total_currency_list and split.upper() not in compare_share_part_currency_list:
compare_share_part_currency_list.append(split)
if len(compare_share_part_currency_list) > 1 and default_currency in compare_share_part_currency_list:
compare_share_part_split = [split for split in compare_share_part_split if split.upper() != default_currency]
new_compare_share_part = ' '.join(compare_share_part_split)
compare_list[c_i] = compare.replace(compare_share_part, new_compare_share_part)
except Exception as e:
logger.error(f"Error in update_for_currency: {e}")
return text, share_name, compare_list