diff --git a/calc_metrics.py b/calc_metrics.py index 6d4384b..3d521f8 100644 --- a/calc_metrics.py +++ b/calc_metrics.py @@ -896,6 +896,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") if is_for_all: verify_file_name = f"{verify_file_name}_all" + if zero_equal_none: + verify_file_name = f"{verify_file_name}_zero_equal_none" metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" output_file = os.path.join(output_folder, metrics_file_name) with pd.ExcelWriter(output_file) as writer: @@ -1426,7 +1428,7 @@ if __name__ == "__main__": audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_data_sheet: str = "Sheet1" - verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged_mapping_data_info_46_documents_by_text.xlsx" + verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx" verify_data_sheet: str = "total_mapping_data" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" verify_document_list_file_list = [None, diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index ec0b62b..bd149b6 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -237,6 +237,11 @@ "---Example 3 End---", "The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:", "{\"data\": []}", + "---Example 4 Start---", + "Type of fee or cost Amount 2 How and when paid \nOngoing annual fees and costs 3 \nManagement fees and costs \n0.82% to 1.22% p.a. 
(estimated) \n", + "---Example 4 End---", + "The relevant values, 0.82 and 1.22, are given as a range and should be ignored, so the output should be:", + "{\"data\": []}", "\n", "H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", "---Example 1 Start---", @@ -260,6 +265,8 @@ "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", + "If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.", + "If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is the buy_spread and sell_spread.", "So the output should be:", "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "\n", @@ -356,13 +363,37 @@ "buy_spread": [ "A. 
Exclude reported name", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", - "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)", + "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), ", + "Estimated transaction costs offset by buy/sell spreads (% pa)", + "---Example Start---", + "Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n", + "---Example End---", + "The data should be excluded, so the output should be:", + "{\"data\": []}", "B. Simple case with simple table structure:", "---Example 1 Start---", "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n", "---Example 1 End---", "The output should be:", - "{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" + "{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "\n", + "---Example 2 Start---", + "Fund name \nManagement fees \nand costs (% p.a.) \n1 \nTransaction costs \n(% p.a.) 
\n1 \nBuy/sell spread \n(%) \n2 \nEveryday Investing Balanced Fund 0.35 0.05 0.00\n", + "---Example 2 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Everyday Investing Balanced Fund\", \"share name\": \"Everyday Investing Balanced Fund\", \"management_fee_and_costs\": 0.35, \"management_fee\": 0.35, \"buy_spread\": 0, \"sell_spread\": 0}]}", + "\n", + "---Example 3 Start---", + "Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nAUSTRALIAN SHARE \nFirst Sentier Australian Share Fund 0.96% 0.10\nFirst Sentier Concentrated Australian \nShare Fund 0.96% 0.10\nFirst Sentier Imputation Fund 0.97% 0.15\nAUSTRALIAN SHARE – SMALL COMPANIES \nFirst Sentier Australian Small \nCompanies Fund 1.12% 0.15\nGLOBAL SHARE \nStewart Investors Worldwide Leaders \nSustainability Fund 1.17% 0.10\nStewart Investors Worldwide Sustainability \nFund 1.02% 0.10\n", + "---Example 3 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"First Sentier Australian Share Fund\", \"share name\": \"First Sentier Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Concentrated Australian Share Fund\", \"share name\": \"First Sentier Concentrated Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Imputation Fund\", \"share name\": \"First Sentier Imputation Fund\", \"management_fee_and_costs\": 0.97, \"management_fee\": 0.97, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"First Sentier Australian Small Companies Fund\", \"share name\": \"First Sentier Australian Small Companies Fund\", \"management_fee_and_costs\": 1.12, \"management_fee\": 1.12, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"Stewart Investors Worldwide Leaders Sustainability Fund\", \"share name\": \"Stewart 
Investors Worldwide Leaders Sustainability Fund\", \"management_fee_and_costs\": 1.17, \"management_fee\": 1.17, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Stewart Investors Worldwide Sustainability Fund\", \"share name\": \"Stewart Investors Worldwide Sustainability Fund\", \"management_fee_and_costs\": 1.02, \"management_fee\": 1.02, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}", + "\n", + "---Example 4 Start---", + "\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund – Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n", + "---Example 4 End---", + "The output should be:", + "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund – Class A\", \"share name\": \"Allan Gray Australian Equity Fund – Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}" ], "performance_fee_costs": [ "Performance fees is share class level data.", diff --git a/main.py b/main.py index 2df8420..c927a60 100644 --- a/main.py +++ b/main.py @@ -1560,7 +1560,7 @@ if __name__ == "__main__": # "544886057", # "550769189", # "553449663"] - # special_doc_id_list = ["414751292"] + # special_doc_id_list = ["446324179"] # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] pdf_folder: str = r"/data/aus_prospectus/pdf/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"