diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index f03c51a..f768996 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -441,7 +441,17 @@ "b. This example mentioned share classes, please output according to share class.", "The output should be", "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", - "D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume nil for the same and return its values as 0" + "D. Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", + "E. If for performance fee specifically Nil is written in the value then return NULL('') for the same", + "---Example Start---", + "Vanguard Investor Short Term Fixed Interest Fund PLUS Performance fees Nil \nAnd, you will be charged or have deducted \nfrom your investment $0 in performance fees \neach year.", + "---Example End---", + "a. For this example, as Performance fee mentioned as Nil so return NULL('') as performance fee datapoint value.", + "F. 
If you found Example in the header of the table then ignore that table and do not extract value from the same table", + "---Example Start---", + "Example - Vanguard Investor Short Term Fixed Interest Fund \nContribution fees Nil \nFor every additional $5,000 you put in, you \nwill be charged $0. \nPLUS Management fees and \ncosts 3,4 \n0.19% p.a. of the NAV of the Fund \nAnd, for every $500,000 you have in the \nFund, you will be charged or have deducted \nfrom your investment $950 each year. \nPLUS Performance fees Nil \nAnd, you will be charged or have deducted \nfrom your investment $0 in performance fees \neach year.", + "---Example End---", + "a. For this example, you have Example keyword in the header so you should not extract any datapoint values Like performance_fee_costs, management fee etc." ], "minimum_initial_investment": [ "Minimum initial investment is fund level data, belong to integer number, the value examples are 100, 1,000, 5,000, 10,000, etc.", diff --git a/main.py b/main.py index 389aa18..a79f31d 100644 --- a/main.py +++ b/main.py @@ -19,7 +19,7 @@ from core.data_extraction import DataExtraction from core.data_mapping import DataMapping from core.auz_nz.hybrid_solution_script import api_for_fund_matching_call from core.metrics import Metrics - +import certifi class EMEA_AR_Parsing: def __init__( @@ -1507,6 +1507,8 @@ if __name__ == "__main__": # output_data_file_path = os.path.join(merged_total_data_folder, "merged_" + data_file_base_name) # merge_output_data_aus_prospectus(data_file_path, document_mapping_file_path, output_data_file_path) + os.environ["SSL_CERT_FILE"] = certifi.where() + doc_source = "aus_prospectus" sample_document_list_folder: str = r'./sample_documents/' document_list_file: str = "aus_prospectus_29_documents_sample.txt" @@ -1531,7 +1533,7 @@ if __name__ == "__main__": # doc_source = "emea_ar" if doc_source == "aus_prospectus": document_sample_file = ( - r"./sample_documents/aus_prospectus_46_documents_sample.txt" + 
r"./sample_documents\special_cases.txt" ) with open(document_sample_file, "r", encoding="utf-8") as f: special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] diff --git a/utils/gpt_utils.py b/utils/gpt_utils.py index 7f51995..964f89b 100644 --- a/utils/gpt_utils.py +++ b/utils/gpt_utils.py @@ -6,6 +6,7 @@ import os from time import sleep import base64 import dotenv +import httpx # loads .env file with your OPENAI_API_KEY dotenv.load_dotenv() @@ -74,9 +75,9 @@ def chat( ): if not engine.startswith("gpt-4o"): max_tokens = 4096 - + client = AzureOpenAI( - azure_endpoint=azure_endpoint, api_key=api_key, api_version=api_version + azure_endpoint=azure_endpoint, api_key=api_key, api_version=api_version, http_client=httpx.Client(verify=False) ) if (