Support outputting the reported name of each data point

This commit is contained in:
Blade He 2024-10-29 16:47:45 -05:00
parent 9d453c9fae
commit 2645d528b1
3 changed files with 31 additions and 32 deletions

View File

@ -706,6 +706,8 @@ class DataExtraction:
share_level_config = output_requirement.get("share_level", {})
example_list = []
dp_reported_name_config = output_requirement.get("dp_reported_name", {})
dp_reported_name = {}
for datapoint in datapoints:
investment_level = self.datapoint_level_config.get(datapoint, "")
if investment_level == "fund_level":
@ -720,6 +722,7 @@ class DataExtraction:
share_datapoint_value_example[datapoint] = share_level_config.get(
f"{datapoint}_value", []
)
dp_reported_name[datapoint] = dp_reported_name_config.get(datapoint, "")
share_datapoint_list = list(share_datapoint_value_example.keys())
instructions.append(f"Example:\n")
@ -739,7 +742,7 @@ class DataExtraction:
if index < len(share_datapoint_values):
example_dict[share_datapoint] = share_datapoint_values[index]
example_list.append(example_dict)
example_data = {"data": example_list}
example_data = {"data": example_list, "dp_reported_name": dp_reported_name}
instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
instructions.append("\n")
instructions.append("\n")

View File

@ -183,12 +183,13 @@
"Don't ignore data points with negative values, e.g. -0.12, -1.13",
"Don't ignore data points with an explicit zero value, e.g. 0, 0.00",
"Don't extract data points whose values are -, *, **, N/A, N/A%, N/A %, or NONE; these mean the value is NULL, so please skip them.",
"Please also output each data point's reported name as it appears in the context.",
"Example:",
"-----Example Start-----",
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
"-----Example End-----",
"Output:",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\\nfees\"}}",
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
"The output should be JSON format, the format is like below example(s):"
],
@ -221,6 +222,12 @@
-0.15,
0.11
]
},
"dp_reported_name" : {
"tor": "TOR",
"ogc": "OGC",
"ter": "TER",
"performance_fee": "Performance fees"
}
},
"end": [

49
main.py
View File

@ -99,18 +99,22 @@ class EMEA_AR_Parsing:
data_from_gpt = json.load(f)
return data_from_gpt
data_extraction = DataExtraction(
self.doc_id,
self.pdf_file,
self.output_extract_data_folder,
self.page_text_dict,
self.datapoint_page_info,
self.datapoints,
self.document_mapping_info_df,
extract_way=self.extract_way,
output_image_folder=self.output_extract_image_folder,
)
data_from_gpt = data_extraction.extract_data()
try:
data_extraction = DataExtraction(
self.doc_id,
self.pdf_file,
self.output_extract_data_folder,
self.page_text_dict,
self.datapoint_page_info,
self.datapoints,
self.document_mapping_info_df,
extract_way=self.extract_way,
output_image_folder=self.output_extract_image_folder,
)
data_from_gpt = data_extraction.extract_data()
except Exception as e:
logger.error(f"Error: {e}")
data_from_gpt = {"data": []}
return data_from_gpt
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -888,7 +892,7 @@ if __name__ == "__main__":
"532179676",
"534300608",
"539233950",
"533727908",
# "533727908",
"532438414",
"533681744",
"537654645",
@ -901,27 +905,12 @@ if __name__ == "__main__":
"536343790"
]
special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["425595958",
# "451063582",
# "451878128",
# "466580448",
# "481482392",
# "492029971",
# "508704368",
# "510300817",
# "512745032",
# "514213638",
# "527525440",
# "532422548",
# "532998065",
# "534535767",
# "536344026",
# "540307575"]
special_doc_id_list = ["534547266"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True
re_run_mapping_data = True
force_save_total_data = True
force_save_total_data = False
calculate_metrics = False
extract_ways = ["text"]