Dynamically load multilingual instructions.
This commit is contained in:
parent
067d89e0f9
commit
843bbbd13f
|
|
@ -396,6 +396,7 @@
|
||||||
"Erfolgsabhängige Verwaltungsvergütung",
|
"Erfolgsabhängige Verwaltungsvergütung",
|
||||||
"Erfolgsbezogene Vergütung",
|
"Erfolgsbezogene Vergütung",
|
||||||
"Performancegebühren",
|
"Performancegebühren",
|
||||||
|
"Performancevergütung",
|
||||||
"Anlageerfolgsprämie",
|
"Anlageerfolgsprämie",
|
||||||
"An die Wertentwicklung des Fonds gebundene Gebühren",
|
"An die Wertentwicklung des Fonds gebundene Gebühren",
|
||||||
"Performancegebühr",
|
"Performancegebühr",
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
"Synthetic TER",
|
"Synthetic TER",
|
||||||
"Fund TER",
|
"Fund TER",
|
||||||
"TER",
|
"TER",
|
||||||
|
"TFE",
|
||||||
"T.E.R",
|
"T.E.R",
|
||||||
"TER_REF",
|
"TER_REF",
|
||||||
"Total Expense Ratio",
|
"Total Expense Ratio",
|
||||||
|
|
@ -57,13 +58,11 @@
|
||||||
"french": [
|
"french": [
|
||||||
"Le ratio de dépenses totales",
|
"Le ratio de dépenses totales",
|
||||||
"Total des frais sur encours",
|
"Total des frais sur encours",
|
||||||
"TFE",
|
|
||||||
"Ratios des charges totales",
|
"Ratios des charges totales",
|
||||||
"Frais sur encours",
|
"Frais sur encours",
|
||||||
"RCT",
|
"RCT",
|
||||||
"Ratios des charges totales",
|
"Ratios des charges totales",
|
||||||
"Total des frais sur encours",
|
"Total des frais sur encours",
|
||||||
"TER",
|
|
||||||
"Ratio des dépenses totales",
|
"Ratio des dépenses totales",
|
||||||
"Ratio de dépenses totales",
|
"Ratio de dépenses totales",
|
||||||
"coefficienti di spesa totale",
|
"coefficienti di spesa totale",
|
||||||
|
|
@ -162,7 +161,6 @@
|
||||||
"Portfolioumschlagshäufigkeit",
|
"Portfolioumschlagshäufigkeit",
|
||||||
"Umschlagshäufigkeit",
|
"Umschlagshäufigkeit",
|
||||||
"Portefeuilleumsatz",
|
"Portefeuilleumsatz",
|
||||||
"Portfolio Turnover Ratio",
|
|
||||||
"Umsatz",
|
"Umsatz",
|
||||||
"Portfolioumschlagsra",
|
"Portfolioumschlagsra",
|
||||||
"Umschlagkennziffer",
|
"Umschlagkennziffer",
|
||||||
|
|
@ -224,6 +222,7 @@
|
||||||
"Ongoing fee",
|
"Ongoing fee",
|
||||||
"OGC",
|
"OGC",
|
||||||
"OGF",
|
"OGF",
|
||||||
|
"OCF",
|
||||||
"Operation Charge",
|
"Operation Charge",
|
||||||
"On Going Charges",
|
"On Going Charges",
|
||||||
"OC",
|
"OC",
|
||||||
|
|
@ -236,15 +235,18 @@
|
||||||
],
|
],
|
||||||
"spanish": [
|
"spanish": [
|
||||||
"Gastos Corrientes",
|
"Gastos Corrientes",
|
||||||
"Gastos Recurrentes"
|
"Gastos Recurrentes",
|
||||||
|
"Gastos corrientes en porcentaje",
|
||||||
|
"Gastos Corrientes 1)",
|
||||||
|
"Gastos Recurrentes 2)",
|
||||||
|
"Gastos corrientes en porcentaje 3)"
|
||||||
],
|
],
|
||||||
"german": [
|
"german": [
|
||||||
"Laufende Kosten",
|
"Laufende Kosten",
|
||||||
"OCF",
|
"Laufende Kosten in Prozent",
|
||||||
"Ongoing Charge",
|
"Laufende Kosten 1)",
|
||||||
|
"Laufende Kosten in Prozent 2)",
|
||||||
"Laufende Gebühren",
|
"Laufende Gebühren",
|
||||||
"laufende kosten in prozent",
|
|
||||||
"laufenden Kosten",
|
|
||||||
"Betriebskosten",
|
"Betriebskosten",
|
||||||
"Betriebsgebühren",
|
"Betriebsgebühren",
|
||||||
"custos correntes",
|
"custos correntes",
|
||||||
|
|
@ -253,6 +255,9 @@
|
||||||
"dutch": [
|
"dutch": [
|
||||||
"Lopende kosten",
|
"Lopende kosten",
|
||||||
"Lopende kosten factor",
|
"Lopende kosten factor",
|
||||||
|
"Lopende kosten in procent",
|
||||||
|
"Lopende kosten factor 1)",
|
||||||
|
"Lopende kosten in procent 2)",
|
||||||
"LKF",
|
"LKF",
|
||||||
"custos correntes",
|
"custos correntes",
|
||||||
"OCF",
|
"OCF",
|
||||||
|
|
@ -260,28 +265,40 @@
|
||||||
],
|
],
|
||||||
"french": [
|
"french": [
|
||||||
"Frais courants",
|
"Frais courants",
|
||||||
|
"Frais courants exprimés en pourcentage",
|
||||||
|
"Frais courants 1)",
|
||||||
|
"Frais courants exprimés en pourcentage 2)",
|
||||||
"Commission de frais opérationels"
|
"Commission de frais opérationels"
|
||||||
],
|
],
|
||||||
"italian": [
|
"italian": [
|
||||||
"Spese Correnti",
|
"Spese Correnti",
|
||||||
"Durante il funzionamento Addebiti"
|
"Durante il funzionamento Addebiti",
|
||||||
|
"Spese Correnti 1)",
|
||||||
|
"Durante il funzionamento Addebiti 2)"
|
||||||
],
|
],
|
||||||
"portuguese": [
|
"portuguese": [
|
||||||
"Encargos Correntes",
|
"Encargos Correntes",
|
||||||
"Custos correntes"
|
"Custos correntes",
|
||||||
|
"Encargos Correntes 1)",
|
||||||
|
"Custos correntes 2)"
|
||||||
],
|
],
|
||||||
"swedish": [
|
"swedish": [
|
||||||
"Årliga avgifter",
|
"Årliga avgifter",
|
||||||
"pågående avgifter"
|
"pågående avgifter",
|
||||||
|
"Årliga avgifter 1)",
|
||||||
|
"pågående avgifter 2)"
|
||||||
],
|
],
|
||||||
"danish": [
|
"danish": [
|
||||||
"Årlig avgift"
|
"Årlig avgift",
|
||||||
|
"Årlig avgift 1)"
|
||||||
],
|
],
|
||||||
"norwegian": [
|
"norwegian": [
|
||||||
"Løpende gebyrer"
|
"Løpende gebyrer",
|
||||||
|
"Løpende gebyrer 1)"
|
||||||
],
|
],
|
||||||
"malay": [
|
"malay": [
|
||||||
"Tρέχουσες επιβαρύνσεις"
|
"Tρέχουσες επιβαρύνσεις",
|
||||||
|
"Τρέχουσες επιβαρύνσεις 2)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"performance_fee": {
|
"performance_fee": {
|
||||||
|
|
@ -294,6 +311,8 @@
|
||||||
"spanish": [
|
"spanish": [
|
||||||
"Comisión de Gestión sobre Resultados",
|
"Comisión de Gestión sobre Resultados",
|
||||||
"Comisión sobre Resultados",
|
"Comisión sobre Resultados",
|
||||||
|
"Comisión de Gestión sobre Resultados 1)",
|
||||||
|
"Comisión sobre Resultados 2)",
|
||||||
"Comisión de Rentabilidad",
|
"Comisión de Rentabilidad",
|
||||||
"Comisiones de éxito",
|
"Comisiones de éxito",
|
||||||
"Comisión de Éxito",
|
"Comisión de Éxito",
|
||||||
|
|
@ -306,53 +325,64 @@
|
||||||
"Erfolgsabhängige Verwaltungsvergütung",
|
"Erfolgsabhängige Verwaltungsvergütung",
|
||||||
"Erfolgsbezogene Vergütung",
|
"Erfolgsbezogene Vergütung",
|
||||||
"Performancegebühren",
|
"Performancegebühren",
|
||||||
|
"Erfolgsbezogene Vergütung 1)",
|
||||||
|
"Performancevergütung in Prozent",
|
||||||
|
"Performancevergütung in Prozent 2)",
|
||||||
"Anlageerfolgsprämie",
|
"Anlageerfolgsprämie",
|
||||||
"An die Wertentwicklung des Fonds gebundene Gebühren",
|
"An die Wertentwicklung des Fonds gebundene Gebühren",
|
||||||
"Performancegebühr",
|
"Performancegebühr",
|
||||||
"Performance-gebühr",
|
|
||||||
"Erfolgshonorare",
|
"Erfolgshonorare",
|
||||||
"Erfolgsabhän-giger Vergütung",
|
|
||||||
"Erfolgshonorar",
|
"Erfolgshonorar",
|
||||||
"Performance-Fee",
|
|
||||||
"Erfolgsgebühr",
|
"Erfolgsgebühr",
|
||||||
"perfolgsabhängige Verwaltungsvergütung",
|
"perfolgsabhängige Verwaltungsvergütung",
|
||||||
"performanceabhängige Vergütung",
|
"performanceabhängige Vergütung"
|
||||||
"Performance- gebühren"
|
|
||||||
],
|
],
|
||||||
"dutch": [
|
"dutch": [
|
||||||
"Prestatievergoeding"
|
"Prestatievergoeding",
|
||||||
|
"Prestatievergoeding 1)"
|
||||||
],
|
],
|
||||||
"french": [
|
"french": [
|
||||||
"Les commissions de surperformance",
|
"Les commissions de surperformance",
|
||||||
"Commission de performance",
|
"Commission de performance",
|
||||||
"Commissions de surperformance",
|
"Commissions de surperformance",
|
||||||
"frais de performance"
|
"frais de performance",
|
||||||
|
"Commission de performance exprimée en pourcentage 2)"
|
||||||
],
|
],
|
||||||
"swedish": [
|
"swedish": [
|
||||||
"Prestationsbaserad avgift",
|
"Prestationsbaserad avgift",
|
||||||
"Performance-avgift"
|
"Performance-avgift",
|
||||||
|
"Prestationsbaserad avgift 1)",
|
||||||
|
"Performance-avgift 2)"
|
||||||
],
|
],
|
||||||
"norwegian": [
|
"norwegian": [
|
||||||
"prestasjonsgebyr"
|
"prestasjonsgebyr",
|
||||||
|
"prestasjonsgebyr 1)"
|
||||||
],
|
],
|
||||||
"italian": [
|
"italian": [
|
||||||
"Commissioni di performance",
|
"Commissioni di performance",
|
||||||
"Commissioni legate al rendimento",
|
"Commissioni legate al rendimento",
|
||||||
|
"Commissioni di performance 1)",
|
||||||
|
"Commissioni legate al rendimento 2)",
|
||||||
"Commissioni d’incentivo"
|
"Commissioni d’incentivo"
|
||||||
],
|
],
|
||||||
"portuguese": [
|
"portuguese": [
|
||||||
"Comissão de desempenho",
|
"Comissão de desempenho",
|
||||||
"Custos de performance",
|
"Custos de performance",
|
||||||
|
"Comissão de desempenho 1)",
|
||||||
|
"Custos de performance 2)",
|
||||||
"Comissão de Gestão Variável"
|
"Comissão de Gestão Variável"
|
||||||
],
|
],
|
||||||
"estonian": [
|
"estonian": [
|
||||||
" Edukustasud aasta lõikes"
|
"Edukustasud aasta lõikes",
|
||||||
|
"Edukustasud aasta lõikes 1)"
|
||||||
],
|
],
|
||||||
"latvian": [
|
"latvian": [
|
||||||
"Gada prēmijas par sasniegtajiem rezultātiem"
|
"Gada prēmijas par sasniegtajiem rezultātiem",
|
||||||
|
"Gada prēmijas par sasniegtajiem rezultātiem 1)"
|
||||||
],
|
],
|
||||||
"Lithuanian": [
|
"Lithuanian": [
|
||||||
"Metinis mokestis už veiklos rezultatu"
|
"Metinis mokestis už veiklos rezultatu",
|
||||||
|
"Metinis mokestis už veiklos rezultatu 2)"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -57,8 +57,41 @@ class DataExtraction:
|
||||||
self.instructions_config = self.get_instructions_config()
|
self.instructions_config = self.get_instructions_config()
|
||||||
self.datapoint_level_config = self.get_datapoint_level()
|
self.datapoint_level_config = self.get_datapoint_level()
|
||||||
self.datapoint_name_config = self.get_datapoint_name()
|
self.datapoint_name_config = self.get_datapoint_name()
|
||||||
|
self.datapoint_reported_name_config, self.non_english_reported_name_config = \
|
||||||
|
self.get_datapoint_reported_name()
|
||||||
self.extract_way = extract_way
|
self.extract_way = extract_way
|
||||||
self.output_image_folder = output_image_folder
|
self.output_image_folder = output_image_folder
|
||||||
|
|
||||||
|
def get_datapoint_reported_name(self):
|
||||||
|
language_config_file = r"./configuration/language.json"
|
||||||
|
self.language_config = {}
|
||||||
|
with open(language_config_file, "r", encoding="utf-8") as file:
|
||||||
|
self.language_config = json.load(file)
|
||||||
|
|
||||||
|
self.language_id = self.document_mapping_info_df["Language"].iloc[0]
|
||||||
|
self.language = self.language_config.get(self.language_id, None)
|
||||||
|
|
||||||
|
datapoint_reported_name_config_file = r"./configuration/datapoint_reported_name.json"
|
||||||
|
all_datapoint_reported_name = {}
|
||||||
|
with open(datapoint_reported_name_config_file, "r", encoding="utf-8") as file:
|
||||||
|
all_datapoint_reported_name = json.load(file)
|
||||||
|
|
||||||
|
non_english_reported_name_config = {}
|
||||||
|
datapoint_reported_name_config = {}
|
||||||
|
common_language = "english"
|
||||||
|
for datapoint, language_reported_name in all_datapoint_reported_name.items():
|
||||||
|
reported_name_list = language_reported_name.get(common_language, [])
|
||||||
|
|
||||||
|
if self.language != "english":
|
||||||
|
reported_name_list.extend(language_reported_name.get(self.language, []))
|
||||||
|
non_english_reported_name_config[datapoint] = language_reported_name.get(self.language, [])
|
||||||
|
# remove duplicate reported names
|
||||||
|
reported_name_list = list(set(reported_name_list))
|
||||||
|
# sort the reported names so the list has a stable order
|
||||||
|
reported_name_list.sort()
|
||||||
|
datapoint_reported_name_config[datapoint] = reported_name_list
|
||||||
|
return datapoint_reported_name_config, non_english_reported_name_config
|
||||||
|
|
||||||
|
|
||||||
def get_provider_mapping(self):
|
def get_provider_mapping(self):
|
||||||
if len(self.document_mapping_info_df) == 0:
|
if len(self.document_mapping_info_df) == 0:
|
||||||
|
|
@ -628,13 +661,103 @@ class DataExtraction:
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
|
|
||||||
instructions.append("Datapoints Reported name:\n")
|
instructions.append("Datapoints Reported name:\n")
|
||||||
reported_name_info = self.instructions_config.get("reported_name", {})
|
instructions.append("Please look for relevant reported names and similar variations in the context.\n")
|
||||||
|
reported_name_info_in_instructions = self.instructions_config.get("reported_name", {})
|
||||||
for datapoint in datapoints:
|
for datapoint in datapoints:
|
||||||
reported_name = reported_name_info.get(datapoint, "")
|
reported_name_list = self.datapoint_reported_name_config.get(datapoint, [])
|
||||||
|
if len(reported_name_list) == 0:
|
||||||
|
reported_name = reported_name_info_in_instructions.get(datapoint, "")
|
||||||
|
else:
|
||||||
|
joined_reported_name = ", ".join(reported_name_list)
|
||||||
|
datapoint_name = datapoint
|
||||||
|
if datapoint_name == "performance_fee":
|
||||||
|
datapoint_name = "performance fees"
|
||||||
|
else:
|
||||||
|
datapoint_name = datapoint_name.upper()
|
||||||
|
reported_name = f"The {datapoint_name} reported name could be:\n{joined_reported_name}"
|
||||||
|
|
||||||
instructions.append(reported_name)
|
instructions.append(reported_name)
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
|
|
||||||
|
if self.language != "english":
|
||||||
|
"""
|
||||||
|
"multilingual_reported_name": {
|
||||||
|
"describe": "Please be careful to extract relevant data by different reported names from multilingual Context.",
|
||||||
|
"regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}",
|
||||||
|
"special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nValue is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}",
|
||||||
|
"value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"],
|
||||||
|
"fund_example": "Fund 1",
|
||||||
|
"share_example": "Share 1"
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
multilingual_reported_name_config = self.instructions_config.get("multilingual_reported_name", {})
|
||||||
|
describe = multilingual_reported_name_config.get("describe", "")
|
||||||
|
regular_example_template = multilingual_reported_name_config.get("regular_example_template", "")
|
||||||
|
special_example_template_none = multilingual_reported_name_config.get("special_example_template_none", "")
|
||||||
|
value_examples = multilingual_reported_name_config.get("value_examples", [])
|
||||||
|
fund_example = multilingual_reported_name_config.get("fund_example", "")
|
||||||
|
share_example = multilingual_reported_name_config.get("share_example", "")
|
||||||
|
instructions.append("Multilingual reported name:\n")
|
||||||
|
instructions.append(f"{describe}\n")
|
||||||
|
|
||||||
|
# capitalize the first character of the language name
|
||||||
|
language = self.language[0].upper() + self.language[1:]
|
||||||
|
for datapoint in datapoints:
|
||||||
|
mul_reported_name_list = self.non_english_reported_name_config.get(datapoint, [])
|
||||||
|
# de-duplicate the reported name list (set() leaves the order arbitrary)
|
||||||
|
mul_reported_name_list = list(set(mul_reported_name_list))
|
||||||
|
if len(mul_reported_name_list) == 0:
|
||||||
|
continue
|
||||||
|
datapoint_name = datapoint
|
||||||
|
if datapoint_name == "performance_fee":
|
||||||
|
datapoint_name = "performance fees"
|
||||||
|
else:
|
||||||
|
datapoint_name = datapoint_name.upper()
|
||||||
|
example_count = 1
|
||||||
|
none_value_example_count = 0
|
||||||
|
for mul_reported_name in mul_reported_name_list:
|
||||||
|
if datapoint in ["ter", "performance_fee"] and example_count == 3:
|
||||||
|
break
|
||||||
|
value = value_examples[example_count % len(value_examples)]
|
||||||
|
answer = {"fund name": fund_example,
|
||||||
|
"share name": share_example,
|
||||||
|
datapoint: float(value.replace(",", "."))}
|
||||||
|
# serialize the answer to a JSON string
|
||||||
|
answer = json.dumps(answer, ensure_ascii=False)
|
||||||
|
example = regular_example_template.format(
|
||||||
|
datapoint=datapoint_name,
|
||||||
|
number=example_count,
|
||||||
|
language=language,
|
||||||
|
fund_name=fund_example,
|
||||||
|
share_name=share_example,
|
||||||
|
reported_name=mul_reported_name,
|
||||||
|
value=value,
|
||||||
|
answer=answer,
|
||||||
|
)
|
||||||
|
instructions.append(example)
|
||||||
|
instructions.append("\n")
|
||||||
|
instructions.append("\n")
|
||||||
|
|
||||||
|
example_count += 1
|
||||||
|
if len(mul_reported_name.split()) > 1:
|
||||||
|
if none_value_example_count != 2:
|
||||||
|
none_value_example = special_example_template_none.format(
|
||||||
|
datapoint=datapoint_name,
|
||||||
|
number=example_count,
|
||||||
|
language=language,
|
||||||
|
fund_name=fund_example,
|
||||||
|
share_name=share_example,
|
||||||
|
reported_name=mul_reported_name,
|
||||||
|
answer = json.dumps({}, ensure_ascii=False)
|
||||||
|
)
|
||||||
|
instructions.append(none_value_example)
|
||||||
|
instructions.append("\n")
|
||||||
|
instructions.append("\n")
|
||||||
|
example_count += 1
|
||||||
|
none_value_example_count += 1
|
||||||
|
|
||||||
|
instructions.append("\n")
|
||||||
instructions.append("Data business features:\n")
|
instructions.append("Data business features:\n")
|
||||||
data_business_features = self.instructions_config.get(
|
data_business_features = self.instructions_config.get(
|
||||||
"data_business_features", {}
|
"data_business_features", {}
|
||||||
|
|
|
||||||
|
|
@ -231,9 +231,12 @@ class DataMapping:
|
||||||
excel_data_file = os.path.join(
|
excel_data_file = os.path.join(
|
||||||
self.output_data_excel_folder, f"{self.doc_id}.xlsx"
|
self.output_data_excel_folder, f"{self.doc_id}.xlsx"
|
||||||
)
|
)
|
||||||
with pd.ExcelWriter(excel_data_file) as writer:
|
try:
|
||||||
mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False)
|
with pd.ExcelWriter(excel_data_file) as writer:
|
||||||
extract_data_df.to_excel(writer, sheet_name="extract_data", index=False)
|
mapping_data_df.to_excel(writer, sheet_name="mapping_data", index=False)
|
||||||
|
extract_data_df.to_excel(writer, sheet_name="extract_data", index=False)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to save excel file: {e}")
|
||||||
|
|
||||||
return mapped_data_list
|
return mapped_data_list
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,14 @@
|
||||||
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), Annualised TER % (with fee waiver), kostenquote, Gesamt kostenquote, etc.",
|
"ter": "The TER reported name could be:\nTER, Total Expense Ratio, Total expense ratio as a percentage, Total Fund Charge, Gross Expense Ratio, All in fee, Total Net Expense Ratio, Weighted Average Expense Ratio, Synthetic total Expense Ratio, Annualised TER including performance fees, Capped Expense Ratio, TER (en %) (with performance), Net TER, Total Expense Ratio in Prozent, Annualisierte TER in % (Mit Gebührenverzicht), Annualised TER % (with fee waiver), kostenquote, Gesamt kostenquote, etc.",
|
||||||
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
|
"performance_fee": "The performance fees reported name could be:\nperformance fees, performance fees ratio, Performance, etc."
|
||||||
},
|
},
|
||||||
|
"multilingual_reported_name": {
|
||||||
|
"describe": "Please be careful to extract relevant data from multilingual Context.",
|
||||||
|
"regular_example_template": "{datapoint} Example {number}:\nLanguage: {language}\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name}\n{value}\n---Context End-----\nAnswer: {answer}",
|
||||||
|
"special_example_template_none": "{datapoint} Example {number}:\nLanguage: {language}\nIf value is belong to \"-, *, **, N/A, N/A%, N/A %, NONE\", ignore it\n---Context Start-----\n{fund_name}\n{share_name}\n{reported_name} 2)\n-\n---Context End-----\nAnswer: {answer}",
|
||||||
|
"value_examples": ["1,98", "3.25", "2.16", "1,73", "4,53"],
|
||||||
|
"fund_example": "Fund 1",
|
||||||
|
"share_example": "Share 1"
|
||||||
|
},
|
||||||
"data_business_features": {
|
"data_business_features": {
|
||||||
"common": [
|
"common": [
|
||||||
"General rules:",
|
"General rules:",
|
||||||
|
|
|
||||||
54
main.py
54
main.py
|
|
@ -791,7 +791,7 @@ def get_sub_metrics(data_df: pd.DataFrame, data_point: str) -> dict:
|
||||||
return metrics
|
return metrics
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
test_calculate_metrics()
|
# test_calculate_metrics()
|
||||||
# test_replace_abbrevation()
|
# test_replace_abbrevation()
|
||||||
# test_translate_pdf()
|
# test_translate_pdf()
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
|
|
@ -1187,32 +1187,48 @@ if __name__ == "__main__":
|
||||||
"546046730",
|
"546046730",
|
||||||
"546919329"
|
"546919329"
|
||||||
]
|
]
|
||||||
# special_doc_id_list = ["507928179"]
|
special_doc_id_list = ["514636958",
|
||||||
|
"514636959",
|
||||||
|
"514636985",
|
||||||
|
"514636988",
|
||||||
|
"514636990",
|
||||||
|
"514636994",
|
||||||
|
"514636957",
|
||||||
|
"514636954",
|
||||||
|
"514636953",
|
||||||
|
"514636952",
|
||||||
|
"501600549",
|
||||||
|
"501600429",
|
||||||
|
"501380553",
|
||||||
|
"501380497",
|
||||||
|
"514636959",
|
||||||
|
"508981020"]
|
||||||
|
special_doc_id_list = ["514636993"]
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = False
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
# pdf_folder = r"/data/emea_ar/small_pdf/"
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
# for extract_way in extract_ways:
|
for extract_way in extract_ways:
|
||||||
# batch_start_job(
|
batch_start_job(
|
||||||
# pdf_folder,
|
pdf_folder,
|
||||||
# page_filter_ground_truth_file,
|
page_filter_ground_truth_file,
|
||||||
# output_extract_data_child_folder,
|
output_extract_data_child_folder,
|
||||||
# output_mapping_child_folder,
|
output_mapping_child_folder,
|
||||||
# output_extract_data_total_folder,
|
output_extract_data_total_folder,
|
||||||
# output_mapping_total_folder,
|
output_mapping_total_folder,
|
||||||
# extract_way,
|
extract_way,
|
||||||
# special_doc_id_list,
|
special_doc_id_list,
|
||||||
# re_run_extract_data,
|
re_run_extract_data,
|
||||||
# re_run_mapping_data,
|
re_run_mapping_data,
|
||||||
# force_save_total_data=force_save_total_data,
|
force_save_total_data=force_save_total_data,
|
||||||
# calculate_metrics=calculate_metrics,
|
calculate_metrics=calculate_metrics,
|
||||||
# )
|
)
|
||||||
|
|
||||||
# test_data_extraction_metrics()
|
# test_data_extraction_metrics()
|
||||||
# test_mapping_raw_name()
|
# test_mapping_raw_name()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue