diff --git a/core/data_extraction.py b/core/data_extraction.py index 439fedf..fe52048 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -706,6 +706,8 @@ class DataExtraction: share_level_config = output_requirement.get("share_level", {}) example_list = [] + dp_reported_name_config = output_requirement.get("dp_reported_name", {}) + dp_reported_name = {} for datapoint in datapoints: investment_level = self.datapoint_level_config.get(datapoint, "") if investment_level == "fund_level": @@ -720,6 +722,7 @@ class DataExtraction: share_datapoint_value_example[datapoint] = share_level_config.get( f"{datapoint}_value", [] ) + dp_reported_name[datapoint] = dp_reported_name_config.get(datapoint, "") share_datapoint_list = list(share_datapoint_value_example.keys()) instructions.append(f"Example:\n") @@ -739,7 +742,7 @@ class DataExtraction: if index < len(share_datapoint_values): example_dict[share_datapoint] = share_datapoint_values[index] example_list.append(example_dict) - example_data = {"data": example_list} + example_data = {"data": example_list, "dp_reported_name": dp_reported_name} instructions.append(json.dumps(example_data, ensure_ascii=False, indent=4)) instructions.append("\n") instructions.append("\n") diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 5b19973..5a09b09 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -183,12 +183,13 @@ "Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", + "Please also output the data point reported name in context.", "Example:", "-----Example Start-----", "Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n", "-----Example End-----", "Output:", - "{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}", + "{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}], \"dp_reported_name\": {\"ter\": \"TER\", \"performance_fee\": \"Performance\nfees\"}}", "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", "The output should be JSON format, the format is like below example(s):" ], @@ -221,6 +222,12 @@ -0.15, 0.11 ] + }, + "dp_reported_name" : { + "tor": "TOR", + "ogc": "OGC", + "ter": "TER", + "performance_fee": "Performance fees" } }, "end": [ diff --git a/main.py b/main.py index b195e80..8aa0ab8 100644 --- a/main.py +++ b/main.py @@ -99,18 +99,22 @@ class EMEA_AR_Parsing: data_from_gpt = json.load(f) return data_from_gpt - data_extraction = DataExtraction( - self.doc_id, - self.pdf_file, - self.output_extract_data_folder, - self.page_text_dict, - self.datapoint_page_info, - self.datapoints, - self.document_mapping_info_df, - extract_way=self.extract_way, - output_image_folder=self.output_extract_image_folder, - ) - data_from_gpt = data_extraction.extract_data() + try: + data_extraction = DataExtraction( + self.doc_id, + self.pdf_file, + self.output_extract_data_folder, + self.page_text_dict, + self.datapoint_page_info, + self.datapoints, + self.document_mapping_info_df, + extract_way=self.extract_way, + output_image_folder=self.output_extract_image_folder, + ) + data_from_gpt = data_extraction.extract_data() + except Exception as e: + logger.error(f"Error: {e}") + data_from_gpt = {"data": []} return data_from_gpt def mapping_data(self, data_from_gpt: list, re_run: bool = False) -> list: @@ -888,7 +892,7 @@ if __name__ == "__main__": "532179676", "534300608", "539233950", - "533727908", + # "533727908", "532438414", "533681744", "537654645", @@ -901,27 +905,12 @@ if __name__ == "__main__": "536343790" ] special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["425595958", - # "451063582", - # "451878128", - # "466580448", - # "481482392", - # "492029971", - # "508704368", - # "510300817", - # "512745032", - # "514213638", - # "527525440", - # "532422548", - # "532998065", - # "534535767", - # "536344026", - # "540307575"] + special_doc_id_list = ["534547266"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"]