Optimize the instructions for management fee and costs.
Support dynamically loading complex instructions by keywords.
This commit is contained in:
parent
d3be711859
commit
f4b4d00f58
|
|
@ -321,6 +321,7 @@
|
|||
"Edu": "Education",
|
||||
"Elevation Fds (IE)": "Elevation UCITS Funds (Ireland) ICAV",
|
||||
"E": "Elite",
|
||||
"EF": "Entry Fee",
|
||||
"Emgnt": "Emergente",
|
||||
"Em": "Emerging",
|
||||
"Emerg": "Emerging",
|
||||
|
|
@ -678,6 +679,7 @@
|
|||
"Nbg Bm": "Neuberger Berman",
|
||||
"Nflz": "Neuflize",
|
||||
"Netrl": "Neutral",
|
||||
"NEF": "Nil Entry",
|
||||
"New Capital": "New Capital Fund Lux",
|
||||
"Nwtn": "Newton",
|
||||
"NN (B) Invest": "NN (B) Invest",
|
||||
|
|
|
|||
|
|
@ -42,7 +42,9 @@ def get_abb_json(doc_source: str = "aus_prospectus"):
|
|||
def get_abbre_format_str(fundname, doc_source: str = "aus_prospectus"):
|
||||
"""Replaces abbreviations in a fund name with their expanded forms."""
|
||||
# Convert fund name to lowercase while matching
|
||||
f_list = fundname.lower().split()
|
||||
# replace special characters with space
|
||||
f_list = re.sub(r'[^a-zA-Z0-9\s]', ' ', fundname).lower().split()
|
||||
# f_list = fundname.lower().split()
|
||||
get_abb_json(doc_source)
|
||||
updated_doc_fname_words = [ABB_JSON.get(word, word).lower() for word in f_list]
|
||||
return " ".join(updated_doc_fname_words)
|
||||
|
|
|
|||
|
|
@ -575,7 +575,7 @@ class DataExtraction:
|
|||
previous_page_datapoints = []
|
||||
previous_page_fund_name = None
|
||||
for page_num, page_text in self.page_text_dict.items():
|
||||
# if page_num != 8:
|
||||
# if page_num != 40:
|
||||
# continue
|
||||
if page_num in handled_page_num_list:
|
||||
continue
|
||||
|
|
@ -1501,6 +1501,10 @@ class DataExtraction:
|
|||
instructions.append("\n")
|
||||
|
||||
special_rule_info = data_business_features.get("special_rule", {})
|
||||
# Why the complex special rules are loaded by keywords (config key "sepcial_rule_by_keywords"):
|
||||
# 1. These special rules are very complex, so their prompts are very long.
|
||||
# 2. Loading them only when their keywords appear in the page text keeps the prompts short for simple cases.
|
||||
complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "")
|
||||
with_special_rule_title = False
|
||||
for datapoint in datapoints:
|
||||
special_rule_list = special_rule_info.get(datapoint, [])
|
||||
|
|
@ -1511,6 +1515,26 @@ class DataExtraction:
|
|||
special_rule = "\n".join(special_rule_list)
|
||||
instructions.append(special_rule)
|
||||
instructions.append("\n\n")
|
||||
if page_text is None or len(page_text) == 0:
|
||||
continue
|
||||
complex_special_rule_list = complex_special_rule.get(datapoint, [])
|
||||
for complex_special_rule in complex_special_rule_list:
|
||||
complex_keywords = complex_special_rule.get("keywords", [])
|
||||
if len(complex_keywords) == 0:
|
||||
continue
|
||||
exist_keywords = False
|
||||
for special_keywords in complex_keywords:
|
||||
special_keywrods_regex = add_slash_to_text_as_regex(special_keywords)
|
||||
if special_keywords in page_text or \
|
||||
re.search(special_keywrods_regex, page_text) is not None:
|
||||
exist_keywords = True
|
||||
break
|
||||
if exist_keywords:
|
||||
complex_prompts_list = complex_special_rule.get("prompts", [])
|
||||
if len(complex_prompts_list) > 0:
|
||||
complex_prompts = "\n".join(complex_prompts_list)
|
||||
instructions.append(complex_prompts)
|
||||
instructions.append("\n\n")
|
||||
instructions.append("\n")
|
||||
|
||||
instructions.append("Special cases:\n")
|
||||
|
|
|
|||
|
|
@ -164,48 +164,23 @@
|
|||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"SPDR World\", \"share name\": \"SPDR World\", \"management_fee_and_costs\": 0.18, \"management_fee\": 0.18}, {\"fund name\": \"SPDR World (Hedged)\", \"share name\": \"SPDR World (Hedged)\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.21}]}",
|
||||
"\n",
|
||||
"D. With table header: \"Management Fees and costs (A)\" which span 3 sub-columns.\n Please get the 1st column number and 3rd column number from the sub-columns values,",
|
||||
"and sum them as the management_fee_and_costs and management_fee value, ignore other columns values.",
|
||||
"Attention:",
|
||||
"1. For this case, management_fee is equal with management_fee_and_costs.",
|
||||
"2. There are only two decimal places for each number.",
|
||||
"3. Please totally ignore the message in context: \"(A)+(B) + (C) = (D) Total Fees and Costs\"",
|
||||
"4. The values need to sum the the 1st number and the 3rd number.",
|
||||
"Example to calculation pipeline for this case:",
|
||||
"1.54 2.390.13 0.410.00 2.08 2.93",
|
||||
"a. split the number by regex as \\d\\.\\d\\d format,",
|
||||
"and get 7 numbers: 1.54 2.39 0.13 0.41 0.00 2.08 2.93",
|
||||
"b. Sum the 1st number and the 3rd number only: 1.54 + 0.13 = 1.67",
|
||||
"c. Attention: please ignore other numbers, especially the 4th number: 0.41, the sum **is not** 1.54 + 0.13 + 0.41 = 2.08!!",
|
||||
"d. management_fee_and_costs is equal with management_fee, both of them are 1.54 + 0.13 = 1.67 for this case.",
|
||||
"More examples:",
|
||||
"D. If only find \"Management fees\", please output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||
"---Example 1 Start---",
|
||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.02 0.000.00 0.49 1.32\n",
|
||||
"Underlying FundManagement fee component \nVanguard High Growth Index Fund1.50% p.a. of the NAV of the Underlying Fund\n",
|
||||
"---Example 1 End---",
|
||||
"For this case, the 1st number is 0.47, the 3rd number value is 0.02, the sum is 0.49, management_fee is equal with management_fee_and_costs, so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged)\", \"share name\": \"OnePath International Shares Index (Hedged)\", \"management_fee_and_costs\": 0.49, \"management_fee\": 0.49}]}",
|
||||
"---Example 2 Start---",
|
||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\n",
|
||||
"---Example 2 End---",
|
||||
"For this case, the 1st number is 1.44, the 3rd number is 0.00, the sum is 1.44, management_fee is equal with management_fee_and_costs, so the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Pendal Concentrated Global Shares Hedged II\", \"share name\": \"Pendal Concentrated Global Shares Hedged II\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44}]}",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Vanguard High Growth Index Fund\", \"share name\": \"Vanguard High Growth Index Fund\", \"management_fee_and_costs\": 1.5, \"management_fee\": 1.5}]}",
|
||||
"\n",
|
||||
"E. If the table with columns:",
|
||||
"\"Administration fees (% pa)\", \"Investment fees (% pa)\" and \"Estimated other investment costs (% pa)\"",
|
||||
"The administration_fees is \"Administration fees (% pa)\"",
|
||||
"The management_fee is \"Investment fees (% pa)\".",
|
||||
"The management_fee_and_costs is \"Investment fees (% pa)\" + \"Estimated other investment costs (% pa)\".",
|
||||
"E. If with columns \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", \"Performance Fee\", \"Transaction costs\", \"Total investment fees and costs\", please only extraction values from \"Investment fees and costs\" or \"Investment fees and costs (excl Performance Fees)\", output the relevant same value for both of data point keys: \"management_fee_and_costs\" and \"management_fee\".",
|
||||
"---Example 1 Start---",
|
||||
"Investment \noption \nAdministration fees and \nestimated administration costs \nInvestment fees and estimated \ninvestment costs \nEstimated investment \ncosts \nAdministration \nfees \n(% pa) \nInvestment \nfees \n(% pa) \n2 \nEstimated \ntotal \nongoing \nEstimated \nadministration \ncosts \n(% pa) \n1 \nEstimated \nperformance \nfees \n(% pa) \n3 \nEstimated \ntransaction \ncosts \n(% pa) \n5 \nEstimated \nother \ninvestment \ncosts \n(% pa) \n4 \nannual \nfees and \ncosts \n(% pa) \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nProperty and infrastructure \nLazard Global \nListed \nInfrastructure \n0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n",
|
||||
"\n\nInvestment option \nInvestment fees \nand costs (excl \nPerformance Fees) \nPerformance \nFee \nTransaction \ncosts \nTotal \ninvestment \nfees and costs \nBalanced 0.53% 0.43% 0.13%1.09% \nCapital Stable \n0.32% \n0.18% \n0.09% \n0.59% \n",
|
||||
"---Example 1 End---",
|
||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
||||
"But the data points numbers order in data row (for example: 0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n) is correct as initial table structure.",
|
||||
"Please pay attention below information",
|
||||
"Assume the column sequence number is from 1.",
|
||||
"\"Administration fees (% pa)\" values are as the column 1 numbers, \"Investment fees (% pa)\" values are as the column 3 numbers, \"Estimated other investment costs (% pa)\" values are as the column 5 numbers.",
|
||||
"For fund: Lazard Global Listed Infrastructure, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.8, the management_fee_and_costs should be 0.88 = 0.8(the column 3 number) + 0.08 (the column 5 number)",
|
||||
"Therefore, the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0, \"administration_fees\": 0.25}]}, {\"fund name\": \"Lazard Global Listed Infrastructure\", \"share name\": \"Lazard Global Listed Infrastructure\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.08, \"administration_fees\": 0.25}",
|
||||
"For this example, please ignore the \"Total investment fees and costs\" and \"Transaction costs\" columns, ",
|
||||
"just output the values from \"Investment fees and costs (excl Performance Fees)\" as management_fee and management_fee_and_costs, ",
|
||||
"output the values from \"Performance Fee\" as performance_fee.",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Balanced\", \"share name\": \"Balanced\", \"management_fee_and_costs\": 0.53, \"management_fee\": 0.53, \"performance_fee\": 0.43}, {\"fund name\": \"Capital Stable\", \"share name\": \"Capital Stable\", \"management_fee_and_costs\": 0.32, \"management_fee\": 0.32, \"performance_fee\": 0.18}]}",
|
||||
"\n",
|
||||
"F. If the management fee/ management fee and costs is with the range, e.g. 0.05% to 1.00%, please ignore and output empty.",
|
||||
"---Example 1 Start---",
|
||||
"Fees and costs summary \n\nLifeplan Investment Bond \n\nType of fee or cost Amount How and when paid \nOngoing annual fees and costs \nManagement fees and costs 6, 7 \n• \nadministration fee 1,2 of 0.60% p.a. gross of tax \ndeductions (or 0.42% p.a. net of tax deductions) \n7 , \nThe fees and costs for managing \nyour investment \n• \nless \nThe administration fee is calculated and accrued \ndaily and paid monthly in arrears from the \ninvestment option. The administration fee can be \nnegotiated with wholesale clients. 2 \nadministration fee rebate for balances of \n$500,000 or more (refer to ‘Administration fee \nrebate’ section), \nFor the Lifeplan Capital Guaranteed investment \noption \nplus \n• \nThe investment option management costs for each \ninvestment option are shown ‘in the ‘Management \nfees and costs’ section below. \ninvestment option management cost 3 charged \nby the fund managers to manage the underlying \nportfolio estimated between 0.26% and 1.82% p.a. \nfor the previous financial year for the investment \noption. 8 \n",
|
||||
|
|
@ -339,6 +314,117 @@
|
|||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Ausbil Australian Emerging Leaders Fund\", \"benchmark_name\": \"70% S&P/ASX Midcap 50 Accumulation Index 30% S&P/ASX Small Ordinaries Accumulation Index\"}]}"
|
||||
]
|
||||
},
|
||||
"sepcial_rule_by_keywords":
|
||||
{
|
||||
"management_fee_and_costs": [
|
||||
{
|
||||
"keywords": ["Estimated investment \ncosts \nAdministration \nfees"],
|
||||
"prompts": ["Complex management fee and costs rule:",
|
||||
"If the table with columns:",
|
||||
"\"Administration fees (% pa)\", \"Investment fees (% pa)\" and \"Estimated other investment costs (% pa)\"",
|
||||
"The administration_fees is \"Administration fees (% pa)\"",
|
||||
"The management_fee is \"Investment fees (% pa)\".",
|
||||
"The management_fee_and_costs is \"Investment fees (% pa)\" + \"Estimated other investment costs (% pa)\".",
|
||||
"---Example 1 Start---",
|
||||
"Investment \noption \nAdministration fees and \nestimated administration costs \nInvestment fees and estimated \ninvestment costs \nEstimated investment \ncosts \nAdministration \nfees \n(% pa) \nInvestment \nfees \n(% pa) \n2 \nEstimated \ntotal \nongoing \nEstimated \nadministration \ncosts \n(% pa) \n1 \nEstimated \nperformance \nfees \n(% pa) \n3 \nEstimated \ntransaction \ncosts \n(% pa) \n5 \nEstimated \nother \ninvestment \ncosts \n(% pa) \n4 \nannual \nfees and \ncosts \n(% pa) \nCash \nPerpetual Cash \n0.10% \n0.00% \n0.00% \nn/a \n0.00% \n0.02% \n0.12% \nFixed income and credit \nBentham Global \nIncome \n0.25% \n0.00% \n0.67% \nn/a \n0.00% \n0.05% \n0.97% \nProperty and infrastructure \nLazard Global \nListed \nInfrastructure \n0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n",
|
||||
"---Example 1 End---",
|
||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
||||
"But the data points numbers order in data row (for example: 0.25% \n0.00% \n0.80% \nn/a \n0.08% \n0.09% \n1.22% \n) is correct as initial table structure.",
|
||||
"Please pay attention below information",
|
||||
"Assume the column sequence number is from 1.",
|
||||
"\"Administration fees (% pa)\" values are as the column 1 numbers, \"Investment fees (% pa)\" values are as the column 3 numbers, \"Estimated other investment costs (% pa)\" values are as the column 5 numbers.",
|
||||
"For fund: Lazard Global Listed Infrastructure, the administration_fees should be the column 1 number: 0.25, the management_fee should be the column 3 number: 0.8, the management_fee_and_costs should be 0.88 = 0.8(the column 3 number) + 0.08 (the column 5 number)",
|
||||
"Therefore, the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Perpetual Cash\", \"share name\": \"Perpetual Cash\", \"management_fee_and_costs\": 0, \"management_fee\": 0, \"administration_fees\": 0.10}, {\"fund name\": \"Bentham Global Income\", \"share name\": \"Bentham Global Income\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"administration_fees\": 0.25}, {\"fund name\": \"Lazard Global Listed Infrastructure\", \"share name\": \"Lazard Global Listed Infrastructure\", \"management_fee_and_costs\": 0.88, \"management_fee\": 0.8, \"administration_fees\": 0.25}]}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"keywords": ["Entry Fee \nNil Entry"],
|
||||
"prompts": ["Complex management fee and costs rule:",
|
||||
"If the table with columns:",
|
||||
"\"Entry Fee option\", \"Nil Entry Free option\", \"Estimated other investment costs\", \"Estimated Performance fees (B)\"",
|
||||
"The performance_fee is \"Estimated Performance fees (B)\"",
|
||||
"The fund name's tail is \"Entry Fee\" for \"Entry Fee option\", e.g. if fund name is \"MultiSeries 30\", the Entry Fee fund name is \"MultiSeries 30 Entry Fee\"",
|
||||
"The fund name's tail is \"Nil Entry\" for \"Nil Entry Free option\", e.g. if fund name is \"MultiSeries 30\", the Nil Entry fund name is \"MultiSeries 30 Nil Entry\".",
|
||||
"For Entry Fee fund, both of management_fee and management_fee_and_costs are \"Entry Fee option\" + \"Estimated other investment costs\".",
|
||||
"For Nil Entry fund, both of management_fee and management_fee_and_costs are \"Nil Entry Free option\" + \"Estimated other investment costs\".",
|
||||
"---Example 1 Start---",
|
||||
"Management Fees and costs (A) \nOngoing Fee (% p.a.) ‡‡ (A)+(B) + (C) = (D) Total Fees and Costs \nInvestment fund \nEstimated Other \nEstimated \nEstimated \nEntry Fee \nNil Entry \nEntry Fee \noption* \nNil Entry \nFee option \n† \ninvestment costs \nPerformance \nfees (B) \nTransaction \ncosts (C) \noption \nFee option † \nOnePath International Shares \nIndex (Hedged) \n0.47 1.320.00 0.000.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.290.00 0.000.04 1.48 2.33\nPlatinum Asia** \n2.14 2.990.02 0.000.21 2.37 3.22\n",
|
||||
"---Example 1 End---",
|
||||
"For this case, although the table header is with disorder issue during PDF contents extraction issue.",
|
||||
"But the data points numbers order in data row (for example: 2.14 2.990.02 0.000.21 2.37 3.22) is correct as initial table structure.",
|
||||
"Please pay attention below information",
|
||||
"Assume the column sequence number is from 1.",
|
||||
"\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry Free option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees (B)\" values are as the column 4 numbers.",
|
||||
"For main fund: Platinum Asia with values: 2.14 2.990.02 0.000.21 2.37 3.22, ",
|
||||
"the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14(the column 1 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
||||
"the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99(the column 2 number) + 0.02 (the column 3 number), performance_fee is 0 (the column 4 number)",
|
||||
"Therefore, the output should be:",
|
||||
"{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee\": 0}]}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"keywords": ["Indirect costs \ni \nEstimated performance fees"],
|
||||
"prompts": ["Complex management fee and costs rule:",
|
||||
"If the table with columns:",
|
||||
"\"Management fee (% pa)\", \"Indirect costs\", \"Estimated performance fees\", \"Buy/sell spreads\"",
|
||||
"The management_fee is \"Management fee (% pa)\".",
|
||||
"The management_fee_and_costs is \"Management fee (% pa)\" + \"Indirect costs\".",
|
||||
"The performance_fee is \"Estimated performance fees\"",
|
||||
"The buy_spread and sell_spread are \"Buy/sell spreads\".",
|
||||
"---Example 1 Start---",
|
||||
"Indirect costs \ni\nEstimated performance fees \nii\nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nRecoverable \nexpenses \niii \nEstimated other \nindirect costs \nPerformance \nfees charged to \nthe Investment \nOption by \nunderlying \nmanagers \nPerformance \nfees charged by \ninterposed \nvehicles \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 \nMyNorth \nAustralian Fixed \nInterest Index \niv \n0.25 \n0.01 \n0.00 \n0.00 \n0.00 \n0.07 \n0.10/0.10 \nMyNorth \nInternational \nFixed Interest \nIndex - Hedged \n",
|
||||
"---Example 1 End---",
|
||||
"For this case: ",
|
||||
"a. The table header is with disorder issue during PDF contents extraction issue.",
|
||||
"b. The fund name is after the data row, e.g. MyNorth Australian Fixed Interest Index",
|
||||
"c. The data points numbers order in data row, for example: \n0.20 \n0.01 \n0.00 \n0.00 \n0.00 \n0.00 \n0.08/0.08 is correct as initial table structure.",
|
||||
"The 1st number: 0.20 is the management_fee, the 2nd number and the 3th number: 0.01 0.00 are the indirect costs, ",
|
||||
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ",
|
||||
"the 6th number: 0.00 is the transaction costs, ",
|
||||
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
|
||||
"The management_fee_and_costs is management_fee + indirect costs = 0.20 + 0.01 + 0.00= 0.21",
|
||||
"The output should be: ",
|
||||
"{\"data\": [{\"fund name\": \"MyNorth Australian Fixed Interest Index\", \"share name\": \"MyNorth Australian Fixed Interest Index\", \"management_fee_and_costs\": 0.21, \"management_fee\": 0.20, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth International Fixed Interest Index - Hedged\", \"share name\": \"MyNorth International Fixed Interest Index - Hedged\", \"management_fee_and_costs\": 0.26, \"management_fee\": 0.25, \"performance_fee\": 0.00, \"buy_spread\": 0.1, \"sell_spread\": 0.1}]}",
|
||||
"\n",
|
||||
"---Example 2 Start---",
|
||||
"Indirect costs \ni \nEstimated performance fees \nii \nInvestment \nOption \nManagement \nfee \n(% pa) \ni \n(% pa) \n(% pa) \nTransactions \ncosts \n(% pa) \nBuy/sell spreads \n(%) \nMyNorth Index \nModerately \nDefensive \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 \nMyNorth Index \nBalanced \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.09/0.09 \n",
|
||||
"---Example 2 End---",
|
||||
"For this case: ",
|
||||
"a. The table header is with disorder issue during PDF contents extraction issue.",
|
||||
"b. The fund name is before the data row, e.g. MyNorth Index Moderately \nDefensive",
|
||||
"c. The data points numbers order in data row, for example: \n0.55 \n0.00 \n0.00 \n0.00 \n0.00 \n0.01 \n0.08/0.08 is correct as initial table structure.",
|
||||
"The 1st number: 0.55 is the management_fee, the 2nd number and the 3th number: 0.00 0.00 are the indirect costs, ",
|
||||
"the 4th number: 0.00 is the performance_fee, the 5th number: 0.00 is the performance_fee by interposed vehicles, ",
|
||||
"the 6th number: 0.01 is the transaction costs, ",
|
||||
"the 7th number: 0.08 is the buy_spread, the 8th number: 0.08 is the sell_spread.",
|
||||
"The management_fee_and_costs is management_fee + indirect costs = 0.55 + 0.00 + 0.00= 0.55",
|
||||
"The output should be: ",
|
||||
"{\"data\": [{\"fund name\": \"MyNorth Index Moderately Defensive\", \"share name\": \"MyNorth Index Moderately Defensive\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.08, \"sell_spread\": 0.08}, {\"fund name\": \"MyNorth Index Balanced\", \"share name\": \"MyNorth Index Balanced\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55, \"performance_fee\": 0.00, \"buy_spread\": 0.09, \"sell_spread\": 0.09}]}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"keywords": ["Retirement and TTR income streams"],
|
||||
"prompts": ["Complex management fee and costs rule:",
|
||||
"For management_fee_and_costs, ",
|
||||
"a. If the title is \"Retirement and TTR income streams\"",
|
||||
"it means each investment name is with two fund names, one is for Retirement as pension, another is for TTR.",
|
||||
"For example, if the investment name is \"Defensive Growth\", the Retirement fund name is \"Defensive Growth Pension\", the TTR fund name is \"Defensive Growth TTR\".",
|
||||
"b. If the title is \"Retirement income stream only\"",
|
||||
"it means each investment name is with only one fund name, it is for Retirement as pension.",
|
||||
"For example, if the investment name is \"Lifestyle Growth\", the Retirement fund name is \"Lifestyle Growth Pension\".",
|
||||
"c. If the title is \"TTR income stream only\"",
|
||||
"it means each investment name is with only one fund name, it is for TTR.",
|
||||
"For example, if the investment name is \"Balanced\", the TTR fund name is \"Balanced TTR\".",
|
||||
"---Example 1 Start---",
|
||||
"Retirement and TTR income streams \nInvestment fees \nand costs \n1,2,3,4,6 \n0.55% p.a. for Defensive Growth, 0.37% p.a. for International \nShares \nRetirement income stream only \n0.80% p.a. for Lifestyle Growth \nTTR income stream only \n0.77% p.a. for Growth",
|
||||
"---Example 1 End---",
|
||||
"The output should be:",
|
||||
"{\"data\": [{\"fund name\": \"Defensive Growth Pension\", \"share name\": \"Defensive Growth Pension\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"Defensive Growth TTR\", \"share name\": \"Defensive Growth TTR\", \"management_fee_and_costs\": 0.55, \"management_fee\": 0.55}, {\"fund name\": \"International Shares Pension\", \"share name\": \"International Shares Pension\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"International Shares TTR\", \"share name\": \"International Shares TTR\", \"management_fee_and_costs\": 0.37, \"management_fee\": 0.37}, {\"fund name\": \"Lifestyle Growth Pension\", \"share name\": \"Lifestyle Growth Pension\", \"management_fee_and_costs\": 0.80, \"management_fee\": 0.80}, {\"fund name\": \"Growth TTR\", \"share name\": \"Growth TTR\", \"management_fee_and_costs\": 0.77, \"management_fee\": 0.77}]}"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"special_cases": {
|
||||
|
|
|
|||
86
main.py
86
main.py
|
|
@ -1037,15 +1037,15 @@ def batch_run_documents(
|
|||
output_mapping_child_folder: str = r"/data/emea_ar/output/mapping_data/docs/",
|
||||
output_mapping_total_folder: str = r"/data/emea_ar/output/mapping_data/total/",
|
||||
drilldown_folder: str = r"/data/emea_ar/output/drilldown/",
|
||||
re_run_extract_data: bool = True,
|
||||
re_run_mapping_data: bool = True,
|
||||
force_save_total_data: bool = False
|
||||
):
|
||||
sample_document_list_folder = r"./sample_documents/"
|
||||
document_list_files = glob(sample_document_list_folder + "*.txt")
|
||||
page_filter_ground_truth_file = (
|
||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||
)
|
||||
re_run_extract_data = False
|
||||
re_run_mapping_data = False
|
||||
force_save_total_data = True
|
||||
calculate_metrics = False
|
||||
|
||||
extract_way = "text"
|
||||
|
|
@ -1504,24 +1504,29 @@ if __name__ == "__main__":
|
|||
|
||||
# special_doc_id_list = ["553242411"]
|
||||
|
||||
re_run_extract_data = True
|
||||
re_run_mapping_data = True
|
||||
force_save_total_data = True
|
||||
doc_source = "aus_prospectus"
|
||||
# doc_source = "emea_ar"
|
||||
if doc_source == "aus_prospectus":
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_100_documents_multi_fund_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
)
|
||||
# document_sample_file = (
|
||||
# r"./sample_documents/aus_prospectus_52_documents_sample.txt"
|
||||
# r"./sample_documents/aus_prospectus_17_documents_sample.txt"
|
||||
# )
|
||||
document_sample_file = (
|
||||
r"./sample_documents/aus_prospectus_29_documents_sample.txt"
|
||||
)
|
||||
with open(document_sample_file, "r", encoding="utf-8") as f:
|
||||
special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()]
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/from_2024_documents/aus_100_document_prospectus_multi_fund.xlsx"
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/biz_rule/phase1_document_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = ["412778803"]
|
||||
# document_mapping_file = r"/data/aus_prospectus/basic_information/17_documents/aus_prospectus_17_documents_mapping.xlsx"
|
||||
document_mapping_file = r"/data/aus_prospectus/basic_information/29_documents/aus_prospectus_29_documents_mapping.xlsx"
|
||||
# special_doc_id_list: list = ["441280757"]
|
||||
# special_doc_id_list: list = ["401212184"]
|
||||
pdf_folder: str = r"/data/aus_prospectus/pdf/"
|
||||
output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/"
|
||||
output_extract_data_child_folder: str = (
|
||||
|
|
@ -1549,65 +1554,18 @@ if __name__ == "__main__":
|
|||
output_mapping_child_folder=output_mapping_child_folder,
|
||||
output_mapping_total_folder=output_mapping_total_folder,
|
||||
drilldown_folder=drilldown_folder,
|
||||
re_run_extract_data=re_run_extract_data,
|
||||
re_run_mapping_data=re_run_mapping_data,
|
||||
force_save_total_data=force_save_total_data
|
||||
)
|
||||
elif doc_source == "emea_ar":
|
||||
special_doc_id_list = [
|
||||
"292989214",
|
||||
"316237292",
|
||||
"321733631",
|
||||
"323390570",
|
||||
"327956364",
|
||||
"333207452",
|
||||
"334718372",
|
||||
"344636875",
|
||||
"362246081",
|
||||
"366179419",
|
||||
"380945052",
|
||||
"382366116",
|
||||
"387202452",
|
||||
"389171486",
|
||||
"391456740",
|
||||
"391736837",
|
||||
"394778487",
|
||||
"401684600",
|
||||
"402113224",
|
||||
"402181770",
|
||||
"402397014",
|
||||
"405803396",
|
||||
"445102363",
|
||||
"445256897",
|
||||
"448265376",
|
||||
"449555622",
|
||||
"449623976",
|
||||
"458291624",
|
||||
"458359181",
|
||||
"463081566",
|
||||
"469138353",
|
||||
"471641628",
|
||||
"476492237",
|
||||
"478585901",
|
||||
"478586066",
|
||||
"479042264",
|
||||
"479793787",
|
||||
"481475385",
|
||||
"483617247",
|
||||
"486378555",
|
||||
"486383912",
|
||||
"492121213",
|
||||
"497497599",
|
||||
"502693599",
|
||||
"502821436",
|
||||
"503194284",
|
||||
"506559375",
|
||||
"507967525",
|
||||
"508854243",
|
||||
"509845549",
|
||||
"520879048",
|
||||
"529925114",
|
||||
]
|
||||
special_doc_id_list = ["321733631"]
|
||||
batch_run_documents(
|
||||
doc_source=doc_source, special_doc_id_list=special_doc_id_list
|
||||
doc_source=doc_source,
|
||||
special_doc_id_list=special_doc_id_list,
|
||||
re_run_extract_data=re_run_extract_data,
|
||||
re_run_mapping_data=re_run_mapping_data,
|
||||
force_save_total_data=force_save_total_data
|
||||
)
|
||||
|
||||
# new_data_file = r"/data/emea_ar/output/mapping_data/total/mapping_data_info_15_documents_by_text_20241121154243.xlsx"
|
||||
|
|
|
|||
|
|
@ -11,6 +11,11 @@
|
|||
553449663
|
||||
528208796
|
||||
539266817
|
||||
539266874
|
||||
539266880
|
||||
526200514
|
||||
523516443
|
||||
526200513
|
||||
521606755
|
||||
557526129
|
||||
540028470
|
||||
|
|
@ -22,8 +27,3 @@
|
|||
527969661
|
||||
541356150
|
||||
555377021
|
||||
523516443
|
||||
539266874
|
||||
539266880
|
||||
526200514
|
||||
526200513
|
||||
Loading…
Reference in New Issue