Support outputting the reported name of each data point
This commit is contained in:
parent
9d453c9fae
commit
2645d528b1
|
|
@ -706,6 +706,8 @@ class DataExtraction:
|
||||||
share_level_config = output_requirement.get("share_level", {})
|
share_level_config = output_requirement.get("share_level", {})
|
||||||
|
|
||||||
example_list = []
|
example_list = []
|
||||||
|
dp_reported_name_config = output_requirement.get("dp_reported_name", {})
|
||||||
|
dp_reported_name = {}
|
||||||
for datapoint in datapoints:
|
for datapoint in datapoints:
|
||||||
investment_level = self.datapoint_level_config.get(datapoint, "")
|
investment_level = self.datapoint_level_config.get(datapoint, "")
|
||||||
if investment_level == "fund_level":
|
if investment_level == "fund_level":
|
||||||
|
|
@ -720,6 +722,7 @@ class DataExtraction:
|
||||||
share_datapoint_value_example[datapoint] = share_level_config.get(
|
share_datapoint_value_example[datapoint] = share_level_config.get(
|
||||||
f"{datapoint}_value", []
|
f"{datapoint}_value", []
|
||||||
)
|
)
|
||||||
|
dp_reported_name[datapoint] = dp_reported_name_config.get(datapoint, "")
|
||||||
|
|
||||||
share_datapoint_list = list(share_datapoint_value_example.keys())
|
share_datapoint_list = list(share_datapoint_value_example.keys())
|
||||||
instructions.append(f"Example:\n")
|
instructions.append(f"Example:\n")
|
||||||
|
|
@ -739,7 +742,7 @@ class DataExtraction:
|
||||||
if index < len(share_datapoint_values):
|
if index < len(share_datapoint_values):
|
||||||
example_dict[share_datapoint] = share_datapoint_values[index]
|
example_dict[share_datapoint] = share_datapoint_values[index]
|
||||||
example_list.append(example_dict)
|
example_list.append(example_dict)
|
||||||
example_data = {"data": example_list}
|
example_data = {"data": example_list, "dp_reported_name": dp_reported_name}
|
||||||
instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
|
instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4))
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
instructions.append("\n")
|
instructions.append("\n")
|
||||||
|
|
|
||||||
|
|
@ -183,12 +183,13 @@
|
||||||
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
|
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
|
||||||
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
|
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
|
||||||
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
|
"Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
|
||||||
|
"Please also output the data point reported name in context.",
|
||||||
"Example:",
|
"Example:",
|
||||||
"-----Example Start-----",
|
"-----Example Start-----",
|
||||||
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
|
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
|
||||||
"-----Example End-----",
|
"-----Example End-----",
|
||||||
"Output:",
|
"Output:",
|
||||||
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
|
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\nfees\"}}",
|
||||||
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
|
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
|
||||||
"The output should be JSON format, the format is like below example(s):"
|
"The output should be JSON format, the format is like below example(s):"
|
||||||
],
|
],
|
||||||
|
|
@ -221,6 +222,12 @@
|
||||||
-0.15,
|
-0.15,
|
||||||
0.11
|
0.11
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
"dp_reported_name" : {
|
||||||
|
"tor": "TOR",
|
||||||
|
"ogc": "OGC",
|
||||||
|
"ter": "TER",
|
||||||
|
"performance_fee": "Performance fees"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"end": [
|
"end": [
|
||||||
|
|
|
||||||
25
main.py
25
main.py
|
|
@ -99,6 +99,7 @@ class EMEA_AR_Parsing:
|
||||||
data_from_gpt = json.load(f)
|
data_from_gpt = json.load(f)
|
||||||
return data_from_gpt
|
return data_from_gpt
|
||||||
|
|
||||||
|
try:
|
||||||
data_extraction = DataExtraction(
|
data_extraction = DataExtraction(
|
||||||
self.doc_id,
|
self.doc_id,
|
||||||
self.pdf_file,
|
self.pdf_file,
|
||||||
|
|
@ -111,6 +112,9 @@ class EMEA_AR_Parsing:
|
||||||
output_image_folder=self.output_extract_image_folder,
|
output_image_folder=self.output_extract_image_folder,
|
||||||
)
|
)
|
||||||
data_from_gpt = data_extraction.extract_data()
|
data_from_gpt = data_extraction.extract_data()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error: {e}")
|
||||||
|
data_from_gpt = {"data": []}
|
||||||
return data_from_gpt
|
return data_from_gpt
|
||||||
|
|
||||||
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
|
def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list:
|
||||||
|
|
@ -888,7 +892,7 @@ if __name__ == "__main__":
|
||||||
"532179676",
|
"532179676",
|
||||||
"534300608",
|
"534300608",
|
||||||
"539233950",
|
"539233950",
|
||||||
"533727908",
|
# "533727908",
|
||||||
"532438414",
|
"532438414",
|
||||||
"533681744",
|
"533681744",
|
||||||
"537654645",
|
"537654645",
|
||||||
|
|
@ -901,27 +905,12 @@ if __name__ == "__main__":
|
||||||
"536343790"
|
"536343790"
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["425595958",
|
special_doc_id_list = ["534547266"]
|
||||||
# "451063582",
|
|
||||||
# "451878128",
|
|
||||||
# "466580448",
|
|
||||||
# "481482392",
|
|
||||||
# "492029971",
|
|
||||||
# "508704368",
|
|
||||||
# "510300817",
|
|
||||||
# "512745032",
|
|
||||||
# "514213638",
|
|
||||||
# "527525440",
|
|
||||||
# "532422548",
|
|
||||||
# "532998065",
|
|
||||||
# "534535767",
|
|
||||||
# "536344026",
|
|
||||||
# "540307575"]
|
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue