From 40bcce440487b87c42f6a78df55e65de2e9be59e Mon Sep 17 00:00:00 2001 From: Blade He Date: Fri, 20 Sep 2024 10:26:18 -0500 Subject: [PATCH] instructions: explicitly announce, not to collect data which value with -, *, **, N/A, N/A%, N/A %, NONE --- instructions/data_extraction_prompts_config.json | 7 ++++++- main.py | 4 ++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index e012faf..728d6ff 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -129,7 +129,12 @@ "Only output the data point which with relevant value.", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", - "Ignore the data point which value with -, *, **, N/A, N/A%, N/A %, NONE, etc.", + "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.", + "Example:", + "Context:", + "Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n", + "Output:", + "{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}", "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", "The output should be JSON format, the format is like below example(s):" ], diff --git a/main.py b/main.py index 2285f79..4db74d9 100644 --- a/main.py +++ b/main.py @@ -577,13 +577,13 @@ if __name__ == "__main__": # extract_way, # re_run_extract_data) - special_doc_id_list = [] + special_doc_id_list = ["349679479"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_mapping_data = True force_save_total_data = False - extract_ways = ["text", "image"] + extract_ways = ["text"] for extract_way in extract_ways: batch_start_job( pdf_folder,