Support outputting the reported name of each data point

This commit is contained in:
Blade He 2024-10-29 16:47:45 -05:00
parent 9d453c9fae
commit 2645d528b1
3 changed files with 31 additions and 32 deletions

View File

@ -706,6 +706,8 @@ class DataExtraction:
share_level_config = output_requirement.get("share_level", {}) share_level_config = output_requirement.get("share_level", {})
example_list = [] example_list = []
dp_reported_name_config = output_requirement.get("dp_reported_name", {})
dp_reported_name = {}
for datapoint in datapoints: for datapoint in datapoints:
investment_level = self.datapoint_level_config.get(datapoint, "") investment_level = self.datapoint_level_config.get(datapoint, "")
if investment_level == "fund_level": if investment_level == "fund_level":
@ -720,6 +722,7 @@ class DataExtraction:
share_datapoint_value_example[datapoint] = share_level_config.get( share_datapoint_value_example[datapoint] = share_level_config.get(
f"{datapoint}_value", [] f"{datapoint}_value", []
) )
dp_reported_name[datapoint] = dp_reported_name_config.get(datapoint, "")
share_datapoint_list = list(share_datapoint_value_example.keys()) share_datapoint_list = list(share_datapoint_value_example.keys())
instructions.append(f"Example:\n") instructions.append(f"Example:\n")
@ -739,7 +742,7 @@ class DataExtraction:
if index < len(share_datapoint_values): if index < len(share_datapoint_values):
example_dict[share_datapoint] = share_datapoint_values[index] example_dict[share_datapoint] = share_datapoint_values[index]
example_list.append(example_dict) example_list.append(example_dict)
example_data = {"data": example_list} example_data = {"data": example_list, "dp_reported_name": dp_reported_name}
instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4)) instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
instructions.append("\n") instructions.append("\n")
instructions.append("\n") instructions.append("\n")

View File

@ -183,12 +183,13 @@
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
"Please also output the data point reported name in context.",
"Example:", "Example:",
"-----Example Start-----", "-----Example Start-----",
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n", "Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
"-----Example End-----", "-----Example End-----",
"Output:", "Output:",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}", "{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\nfees\"}}",
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
"The output should be JSON format, the format is like below example(s):" "The output should be JSON format, the format is like below example(s):"
], ],
@ -221,6 +222,12 @@
-0.15, -0.15,
0.11 0.11
] ]
},
"dp_reported_name" : {
"tor": "TOR",
"ogc": "OGC",
"ter": "TER",
"performance_fee": "Performance fees"
} }
}, },
"end": [ "end": [

49
main.py
View File

@ -99,18 +99,22 @@ class EMEA_AR_Parsing:
data_from_gpt = json.load(f) data_from_gpt = json.load(f)
return data_from_gpt return data_from_gpt
data_extraction = DataExtraction( try:
self.doc_id, data_extraction = DataExtraction(
self.pdf_file, self.doc_id,
self.output_extract_data_folder, self.pdf_file,
self.page_text_dict, self.output_extract_data_folder,
self.datapoint_page_info, self.page_text_dict,
self.datapoints, self.datapoint_page_info,
self.document_mapping_info_df, self.datapoints,
extract_way=self.extract_way, self.document_mapping_info_df,
output_image_folder=self.output_extract_image_folder, extract_way=self.extract_way,
) output_image_folder=self.output_extract_image_folder,
data_from_gpt = data_extraction.extract_data() )
data_from_gpt = data_extraction.extract_data()
except Exception as e:
logger.error(f"Error: {e}")
data_from_gpt = {"data": []}
return data_from_gpt return data_from_gpt
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
@ -888,7 +892,7 @@ if __name__ == "__main__":
"532179676", "532179676",
"534300608", "534300608",
"539233950", "539233950",
"533727908", # "533727908",
"532438414", "532438414",
"533681744", "533681744",
"537654645", "537654645",
@ -901,27 +905,12 @@ if __name__ == "__main__":
"536343790" "536343790"
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["425595958", special_doc_id_list = ["534547266"]
# "451063582",
# "451878128",
# "466580448",
# "481482392",
# "492029971",
# "508704368",
# "510300817",
# "512745032",
# "514213638",
# "527525440",
# "532422548",
# "532998065",
# "534535767",
# "536344026",
# "540307575"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = False
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]