Merge branches 'aus_prospectus_ravi' and 'aus_prospectus_ravi' of https://msstash.morningstar.com/scm/dc/dc-ml-emea-ar into aus_prospectus_ravi

This commit is contained in:
Ravi Maheshwari 2025-03-13 17:34:35 +05:30
commit 1f6b781b12
3 changed files with 37 additions and 4 deletions

View File

@@ -896,6 +896,8 @@ def calculate_metrics_based_db_data_file(audit_file_path: str = r"/data/aus_pros
verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "") verify_file_name = os.path.basename(verify_file_path).replace(".xlsx", "")
if is_for_all: if is_for_all:
verify_file_name = f"{verify_file_name}_all" verify_file_name = f"{verify_file_name}_all"
if zero_equal_none:
verify_file_name = f"{verify_file_name}_zero_equal_none"
metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx" metrics_file_name = f"metrics_{verify_file_name}_{len(document_id_list)}_documents_4_dps_not_strict.xlsx"
output_file = os.path.join(output_folder, metrics_file_name) output_file = os.path.join(output_folder, metrics_file_name)
with pd.ExcelWriter(output_file) as writer: with pd.ExcelWriter(output_file) as writer:
@@ -1426,7 +1428,7 @@ if __name__ == "__main__":
audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx" audit_file_path: str = r"/data/aus_prospectus/ground_truth/phase2_file/46_documents/46_documents_ground_truth_with_mapping.xlsx"
audit_data_sheet: str = "Sheet1" audit_data_sheet: str = "Sheet1"
verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/merged_mapping_data_info_46_documents_by_text.xlsx" verify_file_path: str = r"/data/aus_prospectus/output/mapping_data/total/mapping_data_info_46_documents_by_text_20250313024715.xlsx"
verify_data_sheet: str = "total_mapping_data" verify_data_sheet: str = "total_mapping_data"
# verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt" # verify_document_list_file: str = "./sample_documents/aus_prospectus_29_documents_sample.txt"
verify_document_list_file_list = [None, verify_document_list_file_list = [None,

View File

@@ -237,6 +237,11 @@
"---Example 3 End---", "---Example 3 End---",
"The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:", "The relevant values: 0.67 and 1.17, are in the range, should ignore, so the output should be:",
"{\"data\": []}", "{\"data\": []}",
"---Example 4 Start---",
"Type of fee or cost Amount 2 How and when paid \nOngoing annual fees and costs 3 \nManagement fees and costs \n0.82% to 1.22% p.a. (estimated) \n",
"---Example 4 End---",
"The relevant values: 0.82 and 1.22, are in the range, should ignore, so the output should be:",
"{\"data\": []}",
"\n", "\n",
"H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.", "H. If the management fee and costs value including the performance fee, please exclude or subtract the performance fee value, just output the management fee and costs value.",
"---Example 1 Start---", "---Example 1 Start---",
@@ -260,6 +265,8 @@
"The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.",
"The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".",
"Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.",
"If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.",
"If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.",
"So the output should be:", "So the output should be:",
"{\"data\": [{\"fund name\": \"CFS Real Return Class A\", \"share name\": \"CFS Real Return Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "{\"data\": [{\"fund name\": \"CFS Real Return Class A\", \"share name\": \"CFS Real Return Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}",
"\n", "\n",
@@ -356,13 +363,37 @@
"buy_spread": [ "buy_spread": [
"A. Exclude reported name", "A. Exclude reported name",
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
"Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs)", "Transaction costs buy/sell spread recovery, Transaction costs reducing return of the investment option (net transaction costs), ",
"Estimated transaction costs offset by buy/sell spreads (% pa), ",
"---Example Start---",
"Option name \nTotal estimated \ntransaction costs \n(% pa) \nEstimated transaction costs \noffset by buy/sell spreads \n(% pa) \nEstimated transaction costs \nborne by the option \n(% pa) \nGenerations Defensive \n0.21 \n0.04 \n0.17 \n",
"---Example End---",
"The data should be excluded, the output should be:",
"{\"data\": []}",
"B. Simple case with simple table structure:", "B. Simple case with simple table structure:",
"---Example 1 Start---", "---Example 1 Start---",
"Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n", "Investment option Buy cost Sell cost \nLifestyle Growth 0% 0%\nLifestyle Balanced 0% 0%\nProperty 0.10% 0.10%\n",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}" "{\"data\": [{\"fund name\": \"Lifestyle Growth\", \"share name\": \"Lifestyle Growth\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Lifestyle Balanced\", \"share name\": \"Lifestyle Balanced\", \"buy_spread\": 0, \"sell_spread\": 0}, {\"fund name\": \"Property\", \"share name\": \"Property\", \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"\n",
"---Example 2 Start---",
"Fund name \nManagement fees \nand costs (% p.a.) \n1 \nTransaction costs \n(% p.a.) \n1 \nBuy/sell spread \n(%) \n2 \nEveryday Investing Balanced Fund 0.35 0.05 0.00\n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Everyday Investing Balanced Fund\", \"share name\": \"Everyday Investing Balanced Fund\", \"management_fee_and_costs\": 0.35, \"management_fee\": 0.35, \"buy_spread\": 0, \"sell_spread\": 0}]}",
"\n",
"---Example 3 Start---",
"Fund name \nManagement \nfees and costs \n(p.a.) 1 \nBuy/sell \nspread \n(%) 2 \nAUSTRALIAN SHARE \nFirst Sentier Australian Share Fund 0.96% 0.10\nFirst Sentier Concentrated Australian \nShare Fund 0.96% 0.10\nFirst Sentier Imputation Fund 0.97% 0.15\nAUSTRALIAN SHARE SMALL COMPANIES \nFirst Sentier Australian Small \nCompanies Fund 1.12% 0.15\nGLOBAL SHARE \nStewart Investors Worldwide Leaders \nSustainability Fund 1.17% 0.10\nStewart Investors Worldwide Sustainability \nFund 1.02% 0.10\n",
"---Example 3 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"First Sentier Australian Share Fund\", \"share name\": \"First Sentier Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Concentrated Australian Share Fund\", \"share name\": \"First Sentier Concentrated Australian Share Fund\", \"management_fee_and_costs\": 0.96, \"management_fee\": 0.96, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"First Sentier Imputation Fund\", \"share name\": \"First Sentier Imputation Fund\", \"management_fee_and_costs\": 0.97, \"management_fee\": 0.97, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"First Sentier Australian Small Companies Fund\", \"share name\": \"First Sentier Australian Small Companies Fund\", \"management_fee_and_costs\": 1.12, \"management_fee\": 1.12, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"Stewart Investors Worldwide Leaders Sustainability Fund\", \"share name\": \"Stewart Investors Worldwide Leaders Sustainability Fund\", \"management_fee_and_costs\": 1.17, \"management_fee\": 1.17, \"buy_spread\": 0.1, \"sell_spread\": 0.1}, {\"fund name\": \"Stewart Investors Worldwide Sustainability Fund\", \"share name\": \"Stewart Investors Worldwide Sustainability Fund\", \"management_fee_and_costs\": 1.02, \"management_fee\": 1.02, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
"\n",
"---Example 4 Start---",
"\n\nInvestment option \nGross total \ntransaction costs 1 \n% p.a. \nNet total transaction \ncosts 2 \n% p.a. \nBuy-sell \nspread (ITC) 3 \n% \nAllan Gray Australian Equity Fund Class A 0.06 0.00 0.40\nAlphinity Sustainable Share Fund 0.15 0.02 0.40\n",
"---Example 4 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund Class A\", \"share name\": \"Allan Gray Australian Equity Fund Class A\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"buy_spread\": 0.4, \"sell_spread\": 0.4}]}"
], ],
"performance_fee_costs": [ "performance_fee_costs": [
"Performance fees is share class level data.", "Performance fees is share class level data.",

View File

@@ -1560,7 +1560,7 @@ if __name__ == "__main__":
# "544886057", # "544886057",
# "550769189", # "550769189",
# "553449663"] # "553449663"]
# special_doc_id_list = ["414751292"] # special_doc_id_list = ["446324179"]
# special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"] # special_doc_id_list = ["391080133", "391080140", "401212184", "412778803", "420339794", "454036250", "414751292"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"