support benchmark name data extraction

This commit is contained in:
Blade He 2025-02-26 10:05:46 -06:00
parent 357bb6d580
commit f467945cd4
5 changed files with 76 additions and 27 deletions

View File

@ -8,7 +8,7 @@
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund","benchmark name"]}, "benchmark_name": {"english": ["benchmark fund","benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment", "contributions and access to your investment", "start your investment with"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}

View File

@ -8,7 +8,7 @@
"sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]}, "sell_spread": {"english": ["sell-spread", "sell spread", "buy/sell spreads", "BUY-SELL SPREAD", "Buy:", "Sell:"]},
"administration_fees": {"english": ["administration fee", "administration fees","admin fee"]}, "administration_fees": {"english": ["administration fee", "administration fees","admin fee"]},
"interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]}, "interposed_vehicle_performance_fee_cost": {"english": ["Performance fees charged by interposed vehicles","interposed vehicle performance fee cost", "interposed vehicle performance"]},
"benchmark_name": {"english": ["benchmark fund","benchmark name"]}, "benchmark_name": {"english": ["benchmark fund", "benchmark name", "Benchmark", "aims to outperform"]},
"minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]}, "minimum_initial_investment": {"english": ["minimum initial investment","initial investment", "initial investment amount", "minimum investment amounts", "Contributions and access to your investment"]},
"recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]}, "recoverable_expenses": {"english": ["recoverable expenses","recoverable cost","expense recoveries"]},
"indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]} "indirect_costs": {"english": ["indirect cost","indirect fees","indirect fee","indirect costs"]}

View File

@ -70,6 +70,7 @@ class DataExtraction:
self.datapoints = datapoints self.datapoints = datapoints
self.instructions_config = self.get_instructions_config() self.instructions_config = self.get_instructions_config()
self.datapoint_level_config = self.get_datapoint_level() self.datapoint_level_config = self.get_datapoint_level()
self.datapoint_type_config = self.get_datapoint_type()
self.datapoint_name_config = self.get_datapoint_name() self.datapoint_name_config = self.get_datapoint_name()
self.datapoint_reported_name_config, self.non_english_reported_name_config = \ self.datapoint_reported_name_config, self.non_english_reported_name_config = \
self.get_datapoint_reported_name() self.get_datapoint_reported_name()
@ -150,7 +151,6 @@ class DataExtraction:
datapoint_reported_name_config[datapoint] = reported_name_list datapoint_reported_name_config[datapoint] = reported_name_list
return datapoint_reported_name_config, non_english_reported_name_config return datapoint_reported_name_config, non_english_reported_name_config
def get_provider_mapping(self): def get_provider_mapping(self):
if len(self.document_mapping_info_df) == 0: if len(self.document_mapping_info_df) == 0:
return pd.DataFrame() return pd.DataFrame()
@ -169,7 +169,6 @@ class DataExtraction:
pdf_util = PDFUtil(self.pdf_file) pdf_util = PDFUtil(self.pdf_file)
return pdf_util.extract_image_from_page(page_index=page_index, return pdf_util.extract_image_from_page(page_index=page_index,
output_folder=self.output_image_folder) output_folder=self.output_image_folder)
def get_instructions_config(self) -> dict: def get_instructions_config(self) -> dict:
instructions_config_file = os.path.join(self.instruction_folder, "data_extraction_prompts_config.json") instructions_config_file = os.path.join(self.instruction_folder, "data_extraction_prompts_config.json")
with open(instructions_config_file, "r", encoding="utf-8") as f: with open(instructions_config_file, "r", encoding="utf-8") as f:
@ -182,6 +181,12 @@ class DataExtraction:
datapoint_level = json.load(f) datapoint_level = json.load(f)
return datapoint_level return datapoint_level
def get_datapoint_type(self) -> dict:
    """Load the datapoint-type configuration from ``datapoint_type.json``.

    The file is resolved inside ``self.configuration_folder`` and parsed as
    UTF-8 encoded JSON.

    Returns:
        The parsed content of the JSON file. Presumably a mapping from
        datapoint name to a type label such as ``"text"`` -- confirm
        against the configuration file.

    Raises:
        OSError: If the configuration file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    config_path = os.path.join(self.configuration_folder, "datapoint_type.json")
    with open(config_path, mode="r", encoding="utf-8") as config_file:
        return json.load(config_file)
def get_datapoint_name(self) -> dict: def get_datapoint_name(self) -> dict:
datapoint_name_file = os.path.join(self.configuration_folder, "datapoint_name.json") datapoint_name_file = os.path.join(self.configuration_folder, "datapoint_name.json")
with open(datapoint_name_file, "r", encoding="utf-8") as f: with open(datapoint_name_file, "r", encoding="utf-8") as f:
@ -599,12 +604,24 @@ class DataExtraction:
previous_page_last_fund: str = None) -> dict: previous_page_last_fund: str = None) -> dict:
# If can't find numberic value, e.g. 1.25 or 3,88 # If can't find numberic value, e.g. 1.25 or 3,88
# apply Vision ChatGPT to extract data # apply Vision ChatGPT to extract data
exist_data_point_value_text = False
for datapoint in page_datapoints:
if self.datapoint_type_config.get(datapoint, "") == "text":
exist_data_point_value_text = True
break
exist_numeric_value = False
special_code_all = []
page_text_line_count = 100
if not exist_data_point_value_text:
special_code_regex = r"\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f" special_code_regex = r"\x10|\x11|\x12|\x13|\x14|\x15|\x16|\x17|\x18|\x19|\x1a|\x1b|\x1c|\x1d|\x1e|\x1f"
special_code_all = [code for code in re.findall(special_code_regex, page_text) special_code_all = [code for code in re.findall(special_code_regex, page_text)
if code != "\n"] if code != "\n"]
page_text_line_count = len(page_text.split("\n")) page_text_line_count = len(page_text.split("\n"))
numeric_regex = r"\d+(\.|\,)\d+" numeric_regex = r"\d+(\.|\,)\d+"
if not re.search(numeric_regex, page_text) or page_text_line_count < 3 or len(special_code_all) > 100: exist_numeric_value = (re.search(numeric_regex, page_text) is not None)
if not exist_data_point_value_text and (not exist_numeric_value or
page_text_line_count < 3 or
len(special_code_all) > 100):
logger.info(f"Can't find numberic value in page {page_num}, apply Vision ChatGPT to extract data") logger.info(f"Can't find numberic value in page {page_num}, apply Vision ChatGPT to extract data")
return self.extract_data_by_page_image( return self.extract_data_by_page_image(
page_num=page_num, page_num=page_num,

View File

@ -60,7 +60,7 @@
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
"---Example End---", "---Example End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
"- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:", "- 6. Please ignore these words as fund names, it means never extract these words as fund names. They are:",
"\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"." "\"Ready-made portfolios\", \"Simple choice\", \"Build-your-own portfolio\"."
], ],
@ -127,45 +127,45 @@
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}]}",
"\n", "\n",
"A.2 The data value with gross and net, please ignore gross value, output the net value only.", "A.2 The data value with gross and net, please ignore gross value, output the net value only.",
"---Example 2 Start---", "---Example 2 Start---",
"Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n", "Small Fund \nManagement fees \nand costs \n1.17% pa (gross)/2.51% pa (net) \n",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]", "{\"data\": [{\"fund name\": \"Small Fund\", \"share name\": \"Small Fund\", \"management_fee_and_costs\": 2.51, \"management_fee\": 2.51}]}",
"B. If there are multiple Management fee and costs sub-columns, here is the rule: ", "B. If there are multiple Management fee and costs sub-columns, here is the rule: ",
"With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".", "With \"Management fees\" and \"Indirect fee\", sum the values from these two columns: \"Management fees\" + \"Indirect fee\".",
"---Example Start---", "---Example Start---",
"\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n", "\n\nManagement fees \nManagement fees and costs \nIndirect Fee \nPerformance Fees \nTransaction Costs \nTotal \nMLC diversified investment \noption \nMLC Horizon 2 \nIncome Portfolio \n1.35% p.a. \n0.07% p.a. \n0.06% p.a. \n0.01% p.a. \n1.49% p.a. \n",
"---Example End---", "---Example End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]", "{\"data\": [{\"fund name\": \"MLC Horizon 2 Income Portfolio\", \"share name\": \"MLC Horizon 2 Income Portfolio\", \"management_fee_and_costs\": 1.42, \"management_fee\": 1.35, \"indirect_costs\": 0.07, \"performance_fee\": 0.06}]}",
"\n", "\n",
"C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".", "C. If only find \"Management fees and costs\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
"---Example 1 Start---", "---Example 1 Start---",
"The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.", "The fees and costs for managing \nyour investment \nManagement fees and costs \n1 \n• \nSPDR World: 0.30% per annum of net asset \nvalue. This is reduced to 0.18% per annum of net \nasset value with effect from 14 February 2022.",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}]}",
"---Example 2 Start---", "---Example 2 Start---",
"Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n", "Management Fees and Costs \n\nAs at the date of this PDS, Management Fees and Costs will be capped at: \n\n• 0.18% pa of net asset value for SPDR World \n\n• 0.21% pa of net asset value for SPDR World (Hedged) \n\n",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]", "{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
"D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"", "D. With table header: \"Management Fees and costs (A)\" and \"(A)+(B) + (C) = (D) Total Fees and Costs\", please only focus the values under \"Management Fees and costs (A)\"",
"Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n", "Please get the first \"Entry Fee Option\" and \"Estimated Other investment costs\" sub-columns values, and sum as the management_fee_and_costs and management_fee value, ignore other columns values \n",
"---Example 1 Start---", "---Example 1 Start---",
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n", "Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n",
"---Example 1 End---", "---Example 1 End---",
"For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:", "For this case, the first \"Entry Fee Option\" value is 0.47, the first \"Estimated Other investment costs\" value is 0.02, the sum is 0.49, so the output should be:",
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]", "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}",
"---Example 2 Start---", "---Example 2 Start---",
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n", "Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
"---Example 2 End---", "---Example 2 End---",
"For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:", "For this case, the first \"Entry Fee Option\" value is 1.44, the first \"Estimated Other investment costs\" value is 0.00, the sum is 1.44, so the output should be:",
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]" "{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}"
], ],
"buy_spread": [ "buy_spread": [
"Please don't extract data by the reported names for buy_spread or sell_spread, they are: ", "Please don't extract data by the reported names for buy_spread or sell_spread, they are: ",
@ -177,26 +177,58 @@
"The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan", "The minimum investment per Pension Plan account is \n$20,000. The minimum initial investment in any \ninvestment option is $5,000.\n\nPerpetual WealthFocus Pension Plan",
"---Example 1 End---", "---Example 1 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"minimum_initial_investment\": 5000}]", "{\"data\": [{\"fund name\": \"Perpetual WealthFocus Pension Plan\", \"minimum_initial_investment\": 5000}]}",
"\n", "\n",
"---Example 2 Start---", "---Example 2 Start---",
"Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.", "Prime Super \n\n5 Initial investment amount \n\nThe minimum net total initial investment amount is $10,000. Please note before you open your pension account: If you \nhave made personal contributions into super and wish to claim a tax deduction, you will have to lodge a Notice of \nIntent to Claim form with the relevant super fund (including Prime Super) before you roll your super into the Income \nStreams account.",
"---Example 2 End---", "---Example 2 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"Prime Super\", \"minimum_initial_investment\": 10000}]", "{\"data\": [{\"fund name\": \"Prime Super\", \"minimum_initial_investment\": 10000}]}",
"\n", "\n",
"---Example 3 Start---", "---Example 3 Start---",
"Minimum \nPlatform operators \nIndirect investors \ninvestment \namounts and their platform operators \nInitial $500,000 \nAdditional $5,000 \nMinimum investment amounts are subject to the arrangements between indirect investors \n", "Minimum \nPlatform operators \nIndirect investors \ninvestment \namounts and their platform operators \nInitial $500,000 \nAdditional $5,000 \nMinimum investment amounts are subject to the arrangements between indirect investors \n",
"---Example 3 End---", "---Example 3 End---",
"The minimum initial investment is under the \"Initial\", the value is $500,000.", "The minimum initial investment is under the \"Initial\", the value is $500,000.",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 500000}]", "{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 500000}]}",
"\n", "\n",
"---Example 4 Start---", "---Example 4 Start---",
"Contributions and access \nto your investment \n• \n• \nWe provide choice and flexibility for your investment with access to your money at anytime. \nStart your investment with as little as $1,000. \n• \nEstablish a regular savings plan. \n28 \n• \nYou can switch between the investment options and also rebalance within your selected \noptions at any time. \n• \nMinimum withdrawal $500. \n", "Contributions and access \nto your investment \n• \n• \nWe provide choice and flexibility for your investment with access to your money at anytime. \nStart your investment with as little as $1,000. \n• \nEstablish a regular savings plan. \n28 \n• \nYou can switch between the investment options and also rebalance within your selected \noptions at any time. \n• \nMinimum withdrawal $500. \n",
"---Example 4 End---", "---Example 4 End---",
"The output should be:", "The output should be:",
"{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 1000}]" "{\"data\": [{\"fund name\": \"unknown\", \"minimum_initial_investment\": 1000}]}"
],
"benchmark_name": [
"Benchmark is fund level data, usually as index fund name, e.g. S&P/ASX 300 A-REIT Total Return Index ",
"Sometimes, there are multiple benchmark names with weightings in the context, please extract them all including weightings and benchmark names.",
"Example for single benchmark name",
"---Example 1 Start---",
"MLC Property Securities Fund \nInvestment objective \nAims to outperform the Benchmark (after fees and before tax) over 5 year periods. \nBenchmark \nS&P/ASX 300 A-REIT Total Return Index \n",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Property Securities Fund\", \"benchmark_name\": \"S&P/ASX 300 A-REIT Total Return Index\"}]}",
"---Example 2 Start---",
"First Sentier Wholesale Global Listed Infrastructure Fund \n\nGLOBAL PROPERTY AND INFRASTRUCTURE SECURITIES \n\nObjective \n\nTo deliver capital growth and inflation \nprotected income by investing in \na globally diversified portfolio of \ninfrastructure securities. The fund aims \nto outperform the FTSE Global Core \nInfrastructure 50-50 (Net TR) Index \nhedged to Australian dollars over rolling \nthree-year periods before fees and taxes.",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"First Sentier Wholesale Global Listed Infrastructure Fund\", \"benchmark_name\": \"FTSE Global Core Infrastructure 50-50 (Net TR) Index\"}]}",
"---Example 3 Start---",
"MLC Horizon 5 Growth Portfolio \nInvestment objective\nAims to grow by more than inflation +3.5% pa (after fees and tax) over 10 years. \nBenchmark\nInflation is measured by the Consumer Price Index, calculated by the Australian Bureau of Statistics. \nHow the investment option is\nmanaged\nA diversified portfolio that s predominantly weighted towards the more traditionally growth-focused \nassets that tend to provide higher levels of long-term capital growth (eg shares), with a small exposure to \nthe more stable, defensive asset classes of cash and fixed income. \n",
"---Example 3 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"MLC Horizon 5 Growth Portfolio\", \"benchmark_name\": \"Consumer Price Index\"}]}",
"\n",
"Example for multiple benchmark names",
"---Example 1 Start---",
"Investment options other \nthan MLC portfolios \n\nFixed income \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nInvestment objective \n\nBenchmark \n\nHow the investment option is \nmanaged \n\nThe investment option may be \nsuited to you if... \n\nMinimum suggested time to \ninvest \n\nAsset allocation \n\nStandard Risk Measure \n\nMacquarie Income Opportunities Fund \n\nThe fund aims to outperform the Benchmark over the medium term (before fees). It aims to provide \nhigher income returns than traditional cash investments at all stages of interest rate and economic \ncycles. \n\nBloomberg AusBond Bank Bill Index \n\nThe fund predominantly provides exposure to a wide range of domestic and global investment grade \nfloating and fixed rate instruments, asset-backed securities, and cash. The fund may also have \nopportunistic exposure to other fixed income sectors and instruments such as, high yield and emerging \n\nmarkets debt as well as other fixed income instruments. Interest rate risk will generally be hedged \nthrough the use of derivatives such as swaps and futures. \n\nThe investment process aims to reduce the risk of the fund being adversely affected by unexpected \nevents or downgrades in the credit rating of the fund s investments. A disciplined framework is used \nto analyse each sector and proposed investment to assess its risk. \n\nThe fund may be exposed to derivatives to implement its investment strategy. For example, protection \nmay be purchased on issuers that are believed to be over-valued or at risk of downgrade. These \npositions increase in value when the underlying instrument falls in value and decrease in value when \nthe underlying instrument rises in value. \n\nThe portfolio is generally hedged to Australian dollars. 
However, any exposure to emerging markets \ndebt issued in the local currency of the debt will generally be unhedged. Small active currency positions \nmay also be taken when the investment manager believes that there are opportunities to add value \nor hedge risks in the portfolio. \n\nyou want a medium term investment horizon, seeking a steady and reliable income stream. \n\n3 years \n\nAsset class \n\nInvestment grade credit* \n\nHigh yield \n\nEmerging markets debt** \n\nCash \n\n* Includes Australian and global investment grade credit. \n** May include holdings of sub-investment grade instruments. \n\nRanges \n\n0 100% \n\n0 25% \n\n0 25% \n\n0 100% \n\nMedium to high (estimate of 3 to 4 negative annual returns in any 20 year period) \n\nPIMCO Diversified Fixed Interest Fund - Wholesale Class \n\nTo achieve maximum total return by investing in underlying funds that invest in Australian and \nglobal bonds, and to seek to preserve capital through prudent investment management. \n\n50% Bloomberg Barclays Global Aggregate Index (Hedged in Australian dollars) and 50% Bloomberg \nAusBond Composite 0+ Yr Index \n\n",
"---Example 1 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Macquarie Income Opportunities Fund\", \"benchmark_name\": \"Bloomberg AusBond Bank Bill Index\"}, {\"fund name\": \"PIMCO Diversified Fixed Interest Fund - Wholesale Class\", \"benchmark_name\": \"50% Bloomberg Barclays Global Aggregate Index (Hedged in Australian dollars) and 50% Bloomberg AusBond Composite 0+ Yr Index\"}]}",
"---Example 2 Start---",
"Australian shares continued \n\nAusbil Australian Emerging Leaders Fund \nInvestment objective \nTo provide returns above the Benchmark over the medium to long term (before fees and tax). \nBenchmark \n70% S&P/ASX Midcap 50 Accumulation Index \n30% S&P/ASX Small Ordinaries Accumulation Index \nHow the investment option is \nmanaged \nThe fund predominantly invests in a portfolio of mid and small cap Australian equities primarily \nchosen from the S&P/ASX 300 Index, but generally excludes securities from the S&P/ASX 50 Index. \nAt all times the fund will favour sectors and specific companies which it believes will experience \npositive earnings revisions. \n",
"---Example 2 End---",
"The output should be:",
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}"
] ]
} }
}, },
@ -260,7 +292,7 @@
"\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. \n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\nAlphinity Sustainable Share Fund\n0.95\n0.60\n0.42\n1.55\n1.37\nAntipodes Global Fund\n1.20\n0.60\n0.42\n1.80\n1.62\n",
"---Example End---", "---Example End---",
"Output:", "Output:",
"{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]", "{\"data\": [{\"fund name\": \"Allan Gray Australian Equity Fund\", \"share name\": \"Class A\", \"management_fee_and_costs\": 1.19, \"management_fee\": 0.77, \"administration_fees\": 0.42}, {\"fund name\": \"Alphinity Sustainable Share Fund\", \"share name\": \"Alphinity Sustainable Share Fund\", \"management_fee_and_costs\": 1.37, \"management_fee\": 0.95, \"administration_fees\": 0.42}, {\"fund name\": \"Antipodes Global Fund\", \"share name\": \"Antipodes Global Fund\", \"management_fee_and_costs\": 1.62, \"management_fee\": 1.20, \"administration_fees\": 0.42}]}",
"Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.", "Fund level data: (\"fund name\" and \"datapoint_name\") and share level data: (\"fund name\", \"share name\", \"datapoint_name\") should be output separately.",
"The output should be JSON format, the format is like below example(s):" "The output should be JSON format, the format is like below example(s):"
], ],

View File

@ -1042,8 +1042,8 @@ def batch_run_documents(
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
) )
re_run_extract_data = True re_run_extract_data = False
re_run_mapping_data = True re_run_mapping_data = False
force_save_total_data = True force_save_total_data = True
calculate_metrics = False calculate_metrics = False
@ -1486,7 +1486,7 @@ if __name__ == "__main__":
# "555377021", # "555377021",
# "555654388", # "555654388",
# ] # ]
special_doc_id_list: list = ["446324179"] # special_doc_id_list: list = ["411062815", "462770987", "420339794", "441280757"]
pdf_folder: str = r"/data/aus_prospectus/pdf/" pdf_folder: str = r"/data/aus_prospectus/pdf/"
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
output_extract_data_child_folder: str = ( output_extract_data_child_folder: str = (