instructions: explicitly state not to collect data points whose value is -, *, **, N/A, N/A%, N/A %, or NONE

This commit is contained in:
Blade He 2024-09-20 10:26:18 -05:00
parent c4985ac75f
commit 40bcce4404
2 changed files with 8 additions and 3 deletions

View File

@@ -129,7 +129,12 @@
"Only output the data point which with relevant value.", "Only output the data point which with relevant value.",
"Don't ignore the data point which with negative value, e.g. -0.12, -1.13", "Don't ignore the data point which with negative value, e.g. -0.12, -1.13",
"Don't ignore the data point which with explicit zero value, e.g. 0, 0.00", "Don't ignore the data point which with explicit zero value, e.g. 0, 0.00",
"Ignore the data point which value with -, *, **, N/A, N/A%, N/A %, NONE, etc.", "Don't extract data which values are -, *, **, N/A, N/A%, N/A %, NONE, it means the value should be NULL, please skip them.",
"Example:",
"Context:",
"Sub-Funds\nClass of shares\nCurrency\nTER\nPerformance\nfees\nSwiss Life Funds (LUX) Bond Emerging Markets Corporates\nAM - Shares CHF hedged - Capitalisation\nCHF\n0.23%\n-\nAM - Shares EUR hedged - Capitalisation\nEUR\n0.23%\n0.00%\n",
"Output:",
"{\"data\": [{\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares CHF hedged - Capitalisation\", \"ter\": 0.23}, {\"fund name\": \"Swiss Life Funds (LUX) Bond Emerging Markets Corporates\", \"share name\": \"AM - Shares EUR hedged - Capitalisation\", \"ter\": 0.23, \"performance_fee\": 0}]}",
"Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.", "Fund level data: (\"fund name\" and \"TOR\") and share level data: (\"fund name\", \"share name\", \"ter\", \"performance fees\", \"ogc\") should be output separately.",
"The output should be JSON format, the format is like below example(s):" "The output should be JSON format, the format is like below example(s):"
], ],

View File

@@ -577,13 +577,13 @@ if __name__ == "__main__":
# extract_way, # extract_way,
# re_run_extract_data) # re_run_extract_data)
special_doc_id_list = [] special_doc_id_list = ["349679479"]
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = False force_save_total_data = False
extract_ways = ["text", "image"] extract_ways = ["text"]
for extract_way in extract_ways: for extract_way in extract_ways:
batch_start_job( batch_start_job(
pdf_folder, pdf_folder,