dynamic loading instructions for multilingual.

This commit is contained in:
Blade He 2024-11-20 17:00:22 -06:00
parent 067d89e0f9
commit 843bbbd13f
6 changed files with 231 additions and 50 deletions

View File

@ -396,6 +396,7 @@
"Erfolgsabhängige Verwaltungsvergütung",
"Erfolgsbezogene Vergütung",
"Performancegebühren",
"Performancevergütung",
"Anlageerfolgsprämie",
"An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr",

View File

@ -4,6 +4,7 @@
"Synthetic TER",
"Fund TER",
"TER",
"TFE",
"T.E.R",
"TER_REF",
"Total Expense Ratio",
@ -57,13 +58,11 @@
"french": [
"Le ratio de dépenses totales",
"Total des frais sur encours",
"TFE",
"Ratios des charges totales",
"Frais sur encours",
"RCT",
"Ratios des charges totales",
"Total des frais sur encours",
"TER",
"Ratio des dépenses totales",
"Ratio de dépenses totales",
"coefficienti di spesa totale",
@ -162,7 +161,6 @@
"Portfolioumschlagshäufigkeit",
"Umschlagshäufigkeit",
"Portefeuilleumsatz",
"Portfolio Turnover Ratio",
"Umsatz",
"Portfolioumschlagsra",
"Umschlagkennziffer",
@ -224,6 +222,7 @@
"Ongoing fee",
"OGC",
"OGF",
"OCF",
"Operation Charge",
"On Going Charges",
"OC",
@ -236,15 +235,18 @@
],
"spanish": [
"Gastos Corrientes",
"Gastos Recurrentes"
"Gastos Recurrentes",
"Gastos corrientes en porcentaje",
"Gastos Corrientes 1)",
"Gastos Recurrentes 2)",
"Gastos corrientes en porcentaje 3)"
],
"german": [
"Laufende Kosten",
"OCF",
"Ongoing Charge",
"Laufende Kosten in Prozent",
"Laufende Kosten 1)",
"Laufende Kosten in Prozent 2)",
"Laufende Gebühren",
"laufende kosten in prozent",
"laufenden Kosten",
"Betriebskosten",
"Betriebsgebühren",
"custos correntes",
@ -253,6 +255,9 @@
"dutch": [
"Lopende kosten",
"Lopende kosten factor",
"Lopende kosten in procent",
"Lopende kosten factor 1)",
"Lopende kosten in procent 2)",
"LKF",
"custos correntes",
"OCF",
@ -260,28 +265,40 @@
],
"french": [
"Frais courants",
"Frais courants exprimés en pourcentage",
"Frais courants 1)",
"Frais courants exprimés en pourcentage 2)",
"Commission de frais opérationels"
],
"italian": [
"Spese Correnti",
"Durante il funzionamento Addebiti"
"Durante il funzionamento Addebiti",
"Spese Correnti 1)",
"Durante il funzionamento Addebiti 2)"
],
"portuguese": [
"Encargos Correntes",
"Custos correntes"
"Custos correntes",
"Encargos Correntes 1)",
"Custos correntes 2)"
],
"swedish": [
"Årliga avgifter",
"pågående avgifter"
"pågående avgifter",
"Årliga avgifter 1)",
"pågående avgifter 2)"
],
"danish": [
"Årlig avgift"
"Årlig avgift",
"Årlig avgift 1)"
],
"norwegian": [
"Løpende gebyrer"
"Løpende gebyrer",
"Løpende gebyrer 1)"
],
"malay": [
"Tρέχουσες επιβαρύνσεις"
"Tρέχουσες επιβαρύνσεις",
"Τρέχουσες επιβαρύνσεις 2)"
]
},
"performance_fee": {
@ -294,6 +311,8 @@
"spanish": [
"Comisión de Gestión sobre Resultados",
"Comisión sobre Resultados",
"Comisión de Gestión sobre Resultados 1)",
"Comisión sobre Resultados 2)",
"Comisión de Rentabilidad",
"Comisiones de éxito",
"Comisión de Éxito",
@ -306,53 +325,64 @@
"Erfolgsabhängige Verwaltungsvergütung",
"Erfolgsbezogene Vergütung",
"Performancegebühren",
"Erfolgsbezogene Vergütung 1)",
"Performancevergütung in Prozent",
"Performancevergütung in Prozent 2)",
"Anlageerfolgsprämie",
"An die Wertentwicklung des Fonds gebundene Gebühren",
"Performancegebühr",
"Performance-gebühr",
"Erfolgshonorare",
"Erfolgsabhän-giger Vergütung",
"Erfolgshonorar",
"Performance-Fee",
"Erfolgsgebühr",
"perfolgsabhängige Verwaltungsvergütung",
"performanceabhängige Vergütung",
"Performance- gebühren"
"performanceabhängige Vergütung"
],
"dutch": [
"Prestatievergoeding"
"Prestatievergoeding",
"Prestatievergoeding 1)"
],
"french": [
"Les commissions de surperformance",
"Commission de performance",
"Commissions de surperformance",
"frais de performance"
"frais de performance",
"Commission de performance exprimée en pourcentage 2)"
],
"swedish": [
"Prestationsbaserad avgift",
"Performance-avgift"
"Performance-avgift",
"Prestationsbaserad avgift 1)",
"Performance-avgift 2)"
],
"norwegian": [
"prestasjonsgebyr"
"prestasjonsgebyr",
"prestasjonsgebyr 1)"
],
"italian": [
"Commissioni di performance",
"Commissioni legate al rendimento",
"Commissioni di performance 1)",
"Commissioni legate al rendimento 2)",
"Commissioni dincentivo"
],
"portuguese": [
"Comissão de desempenho",
"Custos de performance",
"Comissão de desempenho 1)",
"Custos de performance 2)",
"Comissão de Gestão Variável"
],
"estonian": [
" Edukustasud aasta lõikes"
"Edukustasud aasta lõikes",
"Edukustasud aasta lõikes 1)"
],
"latvian": [
"Gada prēmijas par sasniegtajiem rezultātiem"
"Gada prēmijas par sasniegtajiem rezultātiem",
"Gada prēmijas par sasniegtajiem rezultātiem 1)"
],
"Lithuanian": [
"Metinis mokestis už veiklos rezultatu"
"Metinis mokestis už veiklos rezultatu",
"Metinis mokestis už veiklos rezultatu 2)"
]
}
}

View File

@ -57,9 +57,42 @@ class DataExtraction:
self.instructions_config = self.get_instructions_config()
self.datapoint_level_config = self.get_datapoint_level()
self.datapoint_name_config = self.get_datapoint_name()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name()
self.extract_way = extract_way
self.output_image_folder = output_image_folder
def get_datapoint_reported_name(self):
    """
    Build the reported-name lookup tables for the document's language.

    Reads ./configuration/language.json (language id -> language name) and
    ./configuration/datapoint_reported_name.json
    (datapoint -> language name -> list of reported names). The document
    language is resolved from the first "Language" value of
    self.document_mapping_info_df.

    Side effects:
        Sets self.language_config, self.language_id and self.language.
        self.language falls back to "english" when the mapping frame is
        empty, has no "Language" column, or holds an id that is missing
        from language.json, so later code can always treat it as a
        non-empty string.

    Returns:
        A tuple of two dicts keyed by datapoint:
        - the sorted, de-duplicated English reported names plus, for a
          non-English document, the names of the document language;
        - the document-language names only (no entries at all for an
          English document).

    Raises:
        OSError / json.JSONDecodeError if a configuration file cannot be
        opened or parsed.
    """
    common_language = "english"

    language_config_file = r"./configuration/language.json"
    with open(language_config_file, "r", encoding="utf-8") as file:
        self.language_config = json.load(file)

    # An empty or column-less mapping frame has no first row to read.
    mapping_df = self.document_mapping_info_df
    if len(mapping_df) == 0 or "Language" not in mapping_df.columns:
        self.language_id = None
    else:
        self.language_id = mapping_df["Language"].iloc[0]

    # JSON object keys are always strings, so look the id up as a string.
    language = None
    if self.language_id is not None:
        language = self.language_config.get(str(self.language_id))
    self.language = language if language else common_language

    datapoint_reported_name_config_file = r"./configuration/datapoint_reported_name.json"
    with open(datapoint_reported_name_config_file, "r", encoding="utf-8") as file:
        all_datapoint_reported_name = json.load(file)

    non_english_reported_name_config = {}
    datapoint_reported_name_config = {}
    for datapoint, names_by_language in all_datapoint_reported_name.items():
        # Copy so the loaded configuration is never mutated in place.
        reported_names = list(names_by_language.get(common_language, []))
        if self.language != common_language:
            language_names = names_by_language.get(self.language, [])
            reported_names.extend(language_names)
            non_english_reported_name_config[datapoint] = list(language_names)
        # Drop duplicates and keep a deterministic order.
        datapoint_reported_name_config[datapoint] = sorted(set(reported_names))
    return datapoint_reported_name_config, non_english_reported_name_config
def get_provider_mapping(self):
if len(self.document_mapping_info_df) == 0:
return pd.DataFrame()
@ -628,13 +661,103 @@ class DataExtraction:
instructions.append("\n")
instructions.append("Datapoints Reported name:\n")
reported_name_info = self.instructions_config.get("reported_name", {})
instructions.append("Please look for relevant reported names and similar variations in the context.\n")
reported_name_info_in_instructions = self.instructions_config.get("reported_name", {})
for datapoint in datapoints:
reported_name = reported_name_info.get(datapoint, "")
reported_name_list = self.datapoint_reported_name_config.get(datapoint, [])
if len(reported_name_list) == 0:
reported_name = reported_name_info_in_instructions.get(datapoint, "")
else:
joined_reported_name = ", ".join(reported_name_list)
datapoint_name = datapoint
if datapoint_name == "performance_fee":
datapoint_name = "performance fees"
else:
datapoint_name = datapoint_name.upper()
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
instructions.append(reported_name)
instructions.append("\n")
instructions.append("\n")
if self.language != "english":
"""
"multilingual_reported_name": {
"describe": "Please be careful to extract relevant data by different reported names from multilingual Context.",
"regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}",
"special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nValue is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}",
"value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"]
"fund_example": "Fund 1",
"share_example": "Share 1"
}
"""
multilingual_reported_name_config = self.instructions_config.get("multilingual_reported_name", {})
describe = multilingual_reported_name_config.get("describe", "")
regular_example_template = multilingual_reported_name_config.get("regular_example_template", "")
special_example_template_none = multilingual_reported_name_config.get("special_example_template_none", "")
value_examples = multilingual_reported_name_config.get("value_examples", [])
fund_example = multilingual_reported_name_config.get("fund_example", "")
share_example = multilingual_reported_name_config.get("share_example", "")
instructions.append("Multilingual reported name:\n")
instructions.append(f"{describe}\n")
# capitalize the first character of the language name
language = self.language[0].upper() + self.language[1:]
for datapoint in datapoints:
mul_reported_name_list = self.non_english_reported_name_config.get(datapoint, [])
# de-duplicate the reported names; going through a set also leaves them in arbitrary order
mul_reported_name_list = list(set(mul_reported_name_list))
if len(mul_reported_name_list) == 0:
continue
datapoint_name = datapoint
if datapoint_name == "performance_fee":
datapoint_name = "performance fees"
else:
datapoint_name = datapoint_name.upper()
example_count = 1
none_value_example_count = 0
for mul_reported_name in mul_reported_name_list:
if datapoint in ["ter", "performance_fee"] and example_count == 3:
break
value = value_examples[example_count % len(value_examples)]
answer = {"fund name": fund_example,
"share name": share_example,
datapoint: float(value.replace(",", "."))}
# transfer answer to string
answer = json.dumps(answer, ensure_ascii=False)
example = regular_example_template.format(
datapoint=datapoint_name,
number=example_count,
language=language,
fund_name=fund_example,
share_name=share_example,
reported_name=mul_reported_name,
value=value,
answer=answer,
)
instructions.append(example)
instructions.append("\n")
instructions.append("\n")
example_count += 1
if len(mul_reported_name.split()) > 1:
if none_value_example_count != 2:
none_value_example = special_example_template_none.format(
datapoint=datapoint_name,
number=example_count,
language=language,
fund_name=fund_example,
share_name=share_example,
reported_name=mul_reported_name,
answer = json.dumps({}, ensure_ascii=False)
)
instructions.append(none_value_example)
instructions.append("\n")
instructions.append("\n")
example_count += 1
none_value_example_count += 1
instructions.append("\n")
instructions.append("Data business features:\n")
data_business_features = self.instructions_config.get(
"data_business_features", {}

View File

@ -231,9 +231,12 @@ class DataMapping:
excel_data_file = os.path.join(
self.output_data_excel_folder, f"{self.doc_id}.xlsx"
)
with pd.ExcelWriter(excel_data_file) as writer:
mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False)
extract_data_df.to_excel(writer, sheet_name="extract_data", index=False)
try:
with pd.ExcelWriter(excel_data_file) as writer:
mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False)
extract_data_df.to_excel(writer, sheet_name="extract_data", index=False)
except Exception as e:
logger.error(f"Failed to save excel file: {e}")
return mapped_data_list

View File

@ -19,6 +19,14 @@
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), Annualised TER % (with fee waiver), kostenquote, Gesamt kostenquote, etc.",
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
},
"multilingual_reported_name": {
"describe": "Please be careful to extract relevant data from multilingual Context.",
"regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}",
"special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nIf value is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}",
"value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"],
"fund_example": "Fund 1",
"share_example": "Share 1"
},
"data_business_features": {
"common": [
"General rules:",

54
main.py
View File

@ -791,7 +791,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
return metrics
if __name__ == "__main__":
test_calculate_metrics()
# test_calculate_metrics()
# test_replace_abbrevation()
# test_translate_pdf()
pdf_folder = r"/data/emea_ar/pdf/"
@ -1187,32 +1187,48 @@ if __name__ == "__main__":
"546046730",
"546919329"
]
# special_doc_id_list = ["507928179"]
special_doc_id_list = ["514636958",
"514636959",
"514636985",
"514636988",
"514636990",
"514636994",
"514636957",
"514636954",
"514636953",
"514636952",
"501600549",
"501600429",
"501380553",
"501380497",
"514636959",
"508981020"]
special_doc_id_list = ["514636993"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = False
re_run_mapping_data = False
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = False
calculate_metrics = False
extract_ways = ["text"]
# pdf_folder = r"/data/emea_ar/small_pdf/"
pdf_folder = r"/data/emea_ar/pdf/"
# for extract_way in extract_ways:
# batch_start_job(
# pdf_folder,
# page_filter_ground_truth_file,
# output_extract_data_child_folder,
# output_mapping_child_folder,
# output_extract_data_total_folder,
# output_mapping_total_folder,
# extract_way,
# special_doc_id_list,
# re_run_extract_data,
# re_run_mapping_data,
# force_save_total_data=force_save_total_data,
# calculate_metrics=calculate_metrics,
# )
for extract_way in extract_ways:
batch_start_job(
pdf_folder,
page_filter_ground_truth_file,
output_extract_data_child_folder,
output_mapping_child_folder,
output_extract_data_total_folder,
output_mapping_total_folder,
extract_way,
special_doc_id_list,
re_run_extract_data,
re_run_mapping_data,
force_save_total_data=force_save_total_data,
calculate_metrics=calculate_metrics,
)
# test_data_extraction_metrics()
# test_mapping_raw_name()