Support outputting the data point's reported name
This commit is contained in:
parent
9d453c9fae
commit
2645d528b1
|
|
@ -706,6 +706,8 @@ class DataExtraction:
|
|||
share_level_config = output_requirement.get("share_level", {})
|
||||
|
||||
example_list = []
|
||||
dp_reported_name_config = output_requirement.get("dp_reported_name", {})
|
||||
dp_reported_name = {}
|
||||
for datapoint in datapoints:
|
||||
investment_level = self.datapoint_level_config.get(datapoint, "")
|
||||
if investment_level == "fund_level":
|
||||
|
|
@ -720,6 +722,7 @@ class DataExtraction:
|
|||
share_datapoint_value_example[datapoint] = share_level_config.get(
|
||||
f"{datapoint}_value", []
|
||||
)
|
||||
dp_reported_name[datapoint] = dp_reported_name_config.get(datapoint, "")
|
||||
|
||||
share_datapoint_list = list(share_datapoint_value_example.keys())
|
||||
instructions.append(f"Example:\n")
|
||||
|
|
@ -739,7 +742,7 @@ class DataExtraction:
|
|||
if index < len(share_datapoint_values):
|
||||
example_dict[share_datapoint] = share_datapoint_values[index]
|
||||
example_list.append(example_dict)
|
||||
example_data = {"data": example_list}
|
||||
example_data = {"data": example_list, "dp_reported_name": dp_reported_name}
|
||||
instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
|
||||
instructions.append("\n")
|
||||
instructions.append("\n")
|
||||
|
|
|
|||
|
|
@ -183,12 +183,13 @@
|
|||
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
|
||||
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
|
||||
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
|
||||
"Please also output the data point reported name in context.",
|
||||
"Example:",
|
||||
"-----Example Start-----",
|
||||
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
|
||||
"-----Example End-----",
|
||||
"Output:",
|
||||
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
|
||||
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\nfees\"}}",
|
||||
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
|
||||
"The output should be JSON format, the format is like below example(s):"
|
||||
],
|
||||
|
|
@ -221,6 +222,12 @@
|
|||
-0.15,
|
||||
0.11
|
||||
]
|
||||
},
|
||||
"dp_reported_name" : {
|
||||
"tor": "TOR",
|
||||
"ogc": "OGC",
|
||||
"ter": "TER",
|
||||
"performance_fee": "Performance fees"
|
||||
}
|
||||
},
|
||||
"end": [
|
||||
|
|
|
|||
49
main.py
49
main.py
|
|
@ -99,18 +99,22 @@ class EMEA_AR_Parsing:
|
|||
data_from_gpt = json.load(f)
|
||||
return data_from_gpt
|
||||
|
||||
data_extraction = DataExtraction(
|
||||
self.doc_id,
|
||||
self.pdf_file,
|
||||
self.output_extract_data_folder,
|
||||
self.page_text_dict,
|
||||
self.datapoint_page_info,
|
||||
self.datapoints,
|
||||
self.document_mapping_info_df,
|
||||
extract_way=self.extract_way,
|
||||
output_image_folder=self.output_extract_image_folder,
|
||||
)
|
||||
data_from_gpt = data_extraction.extract_data()
|
||||
try:
|
||||
data_extraction = DataExtraction(
|
||||
self.doc_id,
|
||||
self.pdf_file,
|
||||
self.output_extract_data_folder,
|
||||
self.page_text_dict,
|
||||
self.datapoint_page_info,
|
||||
self.datapoints,
|
||||
self.document_mapping_info_df,
|
||||
extract_way=self.extract_way,
|
||||
output_image_folder=self.output_extract_image_folder,
|
||||
)
|
||||
data_from_gpt = data_extraction.extract_data()
|
||||
except Exception as e:
|
||||
logger.error(f"Error: {e}")
|
||||
data_from_gpt = {"data": []}
|
||||
return data_from_gpt
|
||||
|
||||
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
|
||||
|
|
@ -888,7 +892,7 @@ if __name__ == "__main__":
|
|||
"532179676",
|
||||
"534300608",
|
||||
"539233950",
|
||||
"533727908",
|
||||
# "533727908",
|
||||
"532438414",
|
||||
"533681744",
|
||||
"537654645",
|
||||
|
|
@ -901,27 +905,12 @@ if __name__ == "__main__":
|
|||
"536343790"
|
||||
]
|
||||
special_doc_id_list = check_db_mapping_doc_id_list
|
||||
# special_doc_id_list = ["425595958",
|
||||
# "451063582",
|
||||
# "451878128",
|
||||
# "466580448",
|
||||
# "481482392",
|
||||
# "492029971",
|
||||
# "508704368",
|
||||
# "510300817",
|
||||
# "512745032",
|
||||
# "514213638",
|
||||
# "527525440",
|
||||
# "532422548",
|
||||
# "532998065",
|
||||
# "534535767",
|
||||
# "536344026",
|
||||
# "540307575"]
|
||||
special_doc_id_list = ["534547266"]
|
||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
force_save_total_data = False
|
||||
calculate_metrics = False
|
||||
|
||||
extract_ways = ["text"]
|
||||
|
|
|
|||
Loading…
Reference in New Issue