From 843bbbd13f101e6a138009820a41c0472c53c7a8 Mon Sep 17 00:00:00 2001 From: Blade He Date: Wed, 20 Nov 2024 17:00:22 -0600 Subject: [PATCH] dynamic loading instructions for multilingual. --- configuration/datapoint_keyword.json | 1 + ...name.json => datapoint_reported_name.json} | 82 +++++++---- core/data_extraction.py | 127 +++++++++++++++++- core/data_mapping.py | 9 +- .../data_extraction_prompts_config.json | 8 ++ main.py | 54 +++++--- 6 files changed, 231 insertions(+), 50 deletions(-) rename configuration/{datapoint_reportedname.json => datapoint_reported_name.json} (80%) diff --git a/configuration/datapoint_keyword.json b/configuration/datapoint_keyword.json index 9c44846..c71919e 100644 --- a/configuration/datapoint_keyword.json +++ b/configuration/datapoint_keyword.json @@ -396,6 +396,7 @@ "Erfolgsabhängige Verwaltungsvergütung", "Erfolgsbezogene Vergütung", "Performancegebühren", + "Performancevergütung", "Anlageerfolgsprämie", "An die Wertentwicklung des Fonds gebundene Gebühren", "Performancegebühr", diff --git a/configuration/datapoint_reportedname.json b/configuration/datapoint_reported_name.json similarity index 80% rename from configuration/datapoint_reportedname.json rename to configuration/datapoint_reported_name.json index 038c429..1d91860 100644 --- a/configuration/datapoint_reportedname.json +++ b/configuration/datapoint_reported_name.json @@ -4,6 +4,7 @@ "Synthetic TER", "Fund TER", "TER", + "TFE", "T.E.R", "TER_REF", "Total Expense Ratio", @@ -57,13 +58,11 @@ "french": [ "Le ratio de dépenses totales", "Total des frais sur encours", - "TFE", "Ratios des charges totales", "Frais sur encours", "RCT", "Ratios des charges totales", "Total des frais sur encours", - "TER", "Ratio des dépenses totales", "Ratio de dépenses totales", "coefficienti di spesa totale", @@ -162,7 +161,6 @@ "Portfolioumschlagshäufigkeit", "Umschlagshäufigkeit", "Portefeuilleumsatz", - "Portfolio Turnover Ratio", "Umsatz", "Portfolioumschlagsra", "Umschlagkennziffer", @@ 
-224,6 +222,7 @@ "Ongoing fee", "OGC", "OGF", + "OCF", "Operation Charge", "On Going Charges", "OC", @@ -236,15 +235,18 @@ ], "spanish": [ "Gastos Corrientes", - "Gastos Recurrentes" + "Gastos Recurrentes", + "Gastos corrientes en porcentaje", + "Gastos Corrientes 1)", + "Gastos Recurrentes 2)", + "Gastos corrientes en porcentaje 3)" ], "german": [ "Laufende Kosten", - "OCF", - "Ongoing Charge", + "Laufende Kosten in Prozent", + "Laufende Kosten 1)", + "Laufende Kosten in Prozent 2)", "Laufende Gebühren", - "laufende kosten in prozent", - "laufenden Kosten", "Betriebskosten", "Betriebsgebühren", "custos correntes", @@ -253,6 +255,9 @@ "dutch": [ "Lopende kosten", "Lopende kosten factor", + "Lopende kosten in procent", + "Lopende kosten factor 1)", + "Lopende kosten in procent 2)", "LKF", "custos correntes", "OCF", @@ -260,28 +265,40 @@ ], "french": [ "Frais courants", + "Frais courants exprimés en pourcentage", + "Frais courants 1)", + "Frais courants exprimés en pourcentage 2)", "Commission de frais opérationels" ], "italian": [ "Spese Correnti", - "Durante il funzionamento Addebiti" + "Durante il funzionamento Addebiti", + "Spese Correnti 1)", + "Durante il funzionamento Addebiti 2)" ], "portuguese": [ "Encargos Correntes", - "Custos correntes" + "Custos correntes", + "Encargos Correntes 1)", + "Custos correntes 2)" ], "swedish": [ "Årliga avgifter", - "pågående avgifter" + "pågående avgifter", + "Årliga avgifter 1)", + "pågående avgifter 2)" ], "danish": [ - "Årlig avgift" + "Årlig avgift", + "Årlig avgift 1)" ], "norwegian": [ - "Løpende gebyrer" + "Løpende gebyrer", + "Løpende gebyrer 1)" ], "malay": [ - "Tρέχουσες επιβαρύνσεις" + "Tρέχουσες επιβαρύνσεις", + "Τρέχουσες επιβαρύνσεις 2)" ] }, "performance_fee": { @@ -294,6 +311,8 @@ "spanish": [ "Comisión de Gestión sobre Resultados", "Comisión sobre Resultados", + "Comisión de Gestión sobre Resultados 1)", + "Comisión sobre Resultados 2)", "Comisión de Rentabilidad", "Comisiones de éxito", "Comisión de Éxito", 
@@ -306,53 +325,64 @@ "Erfolgsabhängige Verwaltungsvergütung", "Erfolgsbezogene Vergütung", "Performancegebühren", + "Erfolgsbezogene Vergütung 1)", + "Performancevergütung in Prozent", + "Performancevergütung in Prozent 2)", "Anlageerfolgsprämie", "An die Wertentwicklung des Fonds gebundene Gebühren", "Performancegebühr", - "Performance-gebühr", "Erfolgshonorare", - "Erfolgsabhän-giger Vergütung", "Erfolgshonorar", - "Performance-Fee", "Erfolgsgebühr", "perfolgsabhängige Verwaltungsvergütung", - "performanceabhängige Vergütung", - "Performance- gebühren" + "performanceabhängige Vergütung" ], "dutch": [ - "Prestatievergoeding" + "Prestatievergoeding", + "Prestatievergoeding 1)" ], "french": [ "Les commissions de surperformance", "Commission de performance", "Commissions de surperformance", - "frais de performance" + "frais de performance", + "Commission de performance exprimée en pourcentage 2)" ], "swedish": [ "Prestationsbaserad avgift", - "Performance-avgift" + "Performance-avgift", + "Prestationsbaserad avgift 1)", + "Performance-avgift 2)" ], "norwegian": [ - "prestasjonsgebyr" + "prestasjonsgebyr", + "prestasjonsgebyr 1)" ], "italian": [ "Commissioni di performance", "Commissioni legate al rendimento", + "Commissioni di performance 1)", + "Commissioni legate al rendimento 2)", "Commissioni d’incentivo" ], "portuguese": [ "Comissão de desempenho", "Custos de performance", + "Comissão de desempenho 1)", + "Custos de performance 2)", "Comissão de Gestão Variável" ], "estonian": [ - " Edukustasud aasta lõikes" + "Edukustasud aasta lõikes", + "Edukustasud aasta lõikes 1)" ], "latvian": [ - "Gada prēmijas par sasniegtajiem rezultātiem" + "Gada prēmijas par sasniegtajiem rezultātiem", + "Gada prēmijas par sasniegtajiem rezultātiem 1)" ], "Lithuanian": [ - "Metinis mokestis už veiklos rezultatu" + "Metinis mokestis už veiklos rezultatu", + "Metinis mokestis už veiklos rezultatu 2)" ] } } \ No newline at end of file diff --git a/core/data_extraction.py 
b/core/data_extraction.py
index fe52048..587fbc9 100644
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@@ -57,8 +57,41 @@ class DataExtraction:
         self.instructions_config = self.get_instructions_config()
         self.datapoint_level_config = self.get_datapoint_level()
         self.datapoint_name_config = self.get_datapoint_name()
+        self.datapoint_reported_name_config, self.non_english_reported_name_config = \
+            self.get_datapoint_reported_name()
         self.extract_way = extract_way
         self.output_image_folder = output_image_folder
+
+    def get_datapoint_reported_name(self):
+        """Load reported names for the document language.
+
+        Returns ``(all_names, local_names)`` dicts keyed by datapoint: sorted,
+        de-duplicated English + document-language names, and local names only.
+        Unknown or missing language ids fall back to "english".
+        """
+        with open(r"./configuration/language.json", "r", encoding="utf-8") as file:
+            self.language_config = json.load(file)
+        # The mapping frame may be empty (get_provider_mapping guards this too).
+        mapping_df = self.document_mapping_info_df
+        self.language_id = mapping_df["Language"].iloc[0] if len(mapping_df) else None
+        # JSON keys are strings; default to English so self.language is never None.
+        self.language = self.language_config.get(str(self.language_id)) or "english"
+
+        with open(r"./configuration/datapoint_reported_name.json", "r", encoding="utf-8") as file:
+            all_datapoint_reported_name = json.load(file)
+
+        datapoint_reported_name_config = {}
+        non_english_reported_name_config = {}
+        for datapoint, language_reported_name in all_datapoint_reported_name.items():
+            names = set(language_reported_name.get("english", []))
+            if self.language != "english":
+                # Copy so the loaded configuration lists are never mutated.
+                local_names = list(language_reported_name.get(self.language, []))
+                names.update(local_names)
+                non_english_reported_name_config[datapoint] = local_names
+            datapoint_reported_name_config[datapoint] = sorted(names)
+        return datapoint_reported_name_config, non_english_reported_name_config
+
 
     def get_provider_mapping(self):
         if len(self.document_mapping_info_df) == 0:
@@ -628,13 +661,103
@@ class DataExtraction: instructions.append("\n") instructions.append("Datapoints Reported name:\n") - reported_name_info = self.instructions_config.get("reported_name", {}) + instructions.append("Please look for relevant reported names and similar variations in the context.\n") + reported_name_info_in_instructions = self.instructions_config.get("reported_name", {}) for datapoint in datapoints: - reported_name = reported_name_info.get(datapoint, "") + reported_name_list = self.datapoint_reported_name_config.get(datapoint, []) + if len(reported_name_list) == 0: + reported_name = reported_name_info_in_instructions.get(datapoint, "") + else: + joined_reported_name = ", ".join(reported_name_list) + datapoint_name = datapoint + if datapoint_name == "performance_fee": + datapoint_name = "performance fees" + else: + datapoint_name = datapoint_name.upper() + reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}" + instructions.append(reported_name) instructions.append("\n") instructions.append("\n") + + if self.language != "english": + """ + "multilingual_reported_name": { + "describe": "Please be careful to extract relevant data by different reported names from multilingual Context.", + "regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}", + "special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nValue is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}", + "value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"] + "fund_example": "Fund 1", + "share_example": "Share 1" + } + """ + multilingual_reported_name_config = self.instructions_config.get("multilingual_reported_name", {}) + describe = multilingual_reported_name_config.get("describe", "") + 
regular_example_template = multilingual_reported_name_config.get("regular_example_template", "") + special_example_template_none = multilingual_reported_name_config.get("special_example_template_none", "") + value_examples = multilingual_reported_name_config.get("value_examples", []) + fund_example = multilingual_reported_name_config.get("fund_example", "") + share_example = multilingual_reported_name_config.get("share_example", "") + instructions.append("Multilingual reported name:\n") + instructions.append(f"{describe}\n") + + # set language the first char to be upper + language = self.language[0].upper() + self.language[1:] + for datapoint in datapoints: + mul_reported_name_list = self.non_english_reported_name_config.get(datapoint, []) + # shuffle the reported name list + mul_reported_name_list = list(set(mul_reported_name_list)) + if len(mul_reported_name_list) == 0: + continue + datapoint_name = datapoint + if datapoint_name == "performance_fee": + datapoint_name = "performance fees" + else: + datapoint_name = datapoint_name.upper() + example_count = 1 + none_value_example_count = 0 + for mul_reported_name in mul_reported_name_list: + if datapoint in ["ter", "performance_fee"] and example_count == 3: + break + value = value_examples[example_count % len(value_examples)] + answer = {"fund name": fund_example, + "share name": share_example, + datapoint: float(value.replace(",", "."))} + # transfer answer to string + answer = json.dumps(answer, ensure_ascii=False) + example = regular_example_template.format( + datapoint=datapoint_name, + number=example_count, + language=language, + fund_name=fund_example, + share_name=share_example, + reported_name=mul_reported_name, + value=value, + answer=answer, + ) + instructions.append(example) + instructions.append("\n") + instructions.append("\n") + + example_count += 1 + if len(mul_reported_name.split()) > 1: + if none_value_example_count != 2: + none_value_example = special_example_template_none.format( + 
datapoint=datapoint_name, + number=example_count, + language=language, + fund_name=fund_example, + share_name=share_example, + reported_name=mul_reported_name, + answer = json.dumps({}, ensure_ascii=False) + ) + instructions.append(none_value_example) + instructions.append("\n") + instructions.append("\n") + example_count += 1 + none_value_example_count += 1 + instructions.append("\n") instructions.append("Data business features:\n") data_business_features = self.instructions_config.get( "data_business_features", {} diff --git a/core/data_mapping.py b/core/data_mapping.py index 79cbeaf..6288d36 100644 --- a/core/data_mapping.py +++ b/core/data_mapping.py @@ -231,9 +231,12 @@ class DataMapping: excel_data_file = os.path.join( self.output_data_excel_folder, f"{self.doc_id}.xlsx" ) - with pd.ExcelWriter(excel_data_file) as writer: - mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False) - extract_data_df.to_excel(writer, sheet_name="extract_data", index=False) + try: + with pd.ExcelWriter(excel_data_file) as writer: + mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False) + extract_data_df.to_excel(writer, sheet_name="extract_data", index=False) + except Exception as e: + logger.error(f"Failed to save excel file: {e}") return mapped_data_list diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 22c18a5..6d11b18 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -19,6 +19,14 @@ "ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), 
Annualised TER % (with fee waiver), kostenquote, Gesamt kostenquote, etc.", "performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc." }, + "multilingual_reported_name": { + "describe": "Please be careful to extract relevant data from multilingual Context.", + "regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}", + "special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nIf value is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}", + "value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"], + "fund_example": "Fund 1", + "share_example": "Share 1" + }, "data_business_features": { "common": [ "General rules:", diff --git a/main.py b/main.py index e6ca41d..541de17 100644 --- a/main.py +++ b/main.py @@ -791,7 +791,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict: return metrics if __name__ == "__main__": - test_calculate_metrics() + # test_calculate_metrics() # test_replace_abbrevation() # test_translate_pdf() pdf_folder = r"/data/emea_ar/pdf/" @@ -1187,32 +1187,48 @@ if __name__ == "__main__": "546046730", "546919329" ] - # special_doc_id_list = ["507928179"] + special_doc_id_list = ["514636958", + "514636959", + "514636985", + "514636988", + "514636990", + "514636994", + "514636957", + "514636954", + "514636953", + "514636952", + "501600549", + "501600429", + "501380553", + "501380497", + "514636959", + "508981020"] + special_doc_id_list = ["514636993"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" - re_run_extract_data = False - re_run_mapping_data = False + re_run_extract_data = True 
+ re_run_mapping_data = True force_save_total_data = False calculate_metrics = False extract_ways = ["text"] # pdf_folder = r"/data/emea_ar/small_pdf/" pdf_folder = r"/data/emea_ar/pdf/" - # for extract_way in extract_ways: - # batch_start_job( - # pdf_folder, - # page_filter_ground_truth_file, - # output_extract_data_child_folder, - # output_mapping_child_folder, - # output_extract_data_total_folder, - # output_mapping_total_folder, - # extract_way, - # special_doc_id_list, - # re_run_extract_data, - # re_run_mapping_data, - # force_save_total_data=force_save_total_data, - # calculate_metrics=calculate_metrics, - # ) + for extract_way in extract_ways: + batch_start_job( + pdf_folder, + page_filter_ground_truth_file, + output_extract_data_child_folder, + output_mapping_child_folder, + output_extract_data_total_folder, + output_mapping_total_folder, + extract_way, + special_doc_id_list, + re_run_extract_data, + re_run_mapping_data, + force_save_total_data=force_save_total_data, + calculate_metrics=calculate_metrics, + ) # test_data_extraction_metrics() # test_mapping_raw_name()