From 8a5723c150f11fa5d1039ce7cbd247b0af2ff19a Mon Sep 17 00:00:00 2001 From: Blade He Date: Thu, 27 Mar 2025 21:10:33 -0500 Subject: [PATCH] optimize for Entry Fee/ Nil Entry case --- core/data_extraction.py | 23 ++++-- .../data_extraction_prompts_config.json | 73 +++++++++++------ main.py | 78 ++++++++++--------- 3 files changed, 106 insertions(+), 68 deletions(-) diff --git a/core/data_extraction.py b/core/data_extraction.py index 0a14620..0a1b297 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -1635,6 +1635,7 @@ class DataExtraction: if page_text is not None and len(page_text) > 0: logger.info(f"Transfer previous page fund name: {page_text} to be the pre-fix of page text") summary += f"\nThe last fund name of previous PDF page: {page_text}\n" + summary += "If could find the fund name for the first data point value, please ignore this fund name.\n" else: summary = self.instructions_config.get("summary", "\n") @@ -1646,7 +1647,7 @@ class DataExtraction: instructions.extend(image_features) instructions.append("\n") - instructions.append("## Datapoints Reported name:\n") + instructions.append("## Datapoints Reported name\n") instructions.append("Please look for relevant reported names and similar variations in the context.\n") reported_name_info_in_instructions = self.instructions_config.get("reported_name", {}) for datapoint in datapoints: @@ -1746,7 +1747,7 @@ class DataExtraction: none_value_example_count += 1 instructions.append("\n") - instructions.append("## Data business features:\n") + instructions.append("## Data business features\n") data_business_features = self.instructions_config.get( "data_business_features", {} ) @@ -1754,7 +1755,7 @@ class DataExtraction: instructions.append(common) instructions.append("\n") - instructions.append("## Datapoints investment level:\n") + instructions.append("## Datapoints investment level\n") investment_level_info = data_business_features.get("investment_level", {}) for datapoint in datapoints: 
investment_level = investment_level_info.get(datapoint, "") @@ -1762,7 +1763,7 @@ class DataExtraction: instructions.append("\n") instructions.append("\n") - instructions.append("## Datapoints value range:\n") + instructions.append("## Datapoints value range\n") data_value_range_info = data_business_features.get("data_value_range", {}) for datapoint in datapoints: data_value_range = data_value_range_info.get(datapoint, "") @@ -1777,6 +1778,7 @@ class DataExtraction: complex_special_rule = data_business_features.get("sepcial_rule_by_keywords", "") with_special_rule_title = False found_sub_datapoints = [] + datapoint_special_rule = {} for datapoint in datapoints: # If some complex special rule is found, and with sub datapoints, # need not to load relevant rule again. @@ -1807,7 +1809,7 @@ class DataExtraction: complex_prompts_list = complex_special_rule.get("prompts", []) if len(complex_prompts_list) > 0: if not with_special_rule_title: - instructions.append("## Special rule:\n") + instructions.append("## Special rule\n") with_special_rule_title = True complex_prompts = "\n".join(complex_prompts_list) instructions.append(complex_prompts) @@ -1822,8 +1824,13 @@ class DataExtraction: continue special_rule_list = special_rule_info.get(datapoint, []) if len(special_rule_list) > 0: + datapoint_special_rule[datapoint] = special_rule_list + if len(list(datapoint_special_rule.keys())) > 0: + for datapoint, special_rule_list in datapoint_special_rule.items(): + if datapoint in found_sub_datapoints: + continue if not with_special_rule_title: - instructions.append("## Special rule:\n") + instructions.append("## Special rule\n") with_special_rule_title = True special_rule = "\n".join(special_rule_list) instructions.append(special_rule) @@ -1831,7 +1838,7 @@ class DataExtraction: instructions.append("\n") - instructions.append("## Special cases:\n") + instructions.append("## Special cases\n") special_cases = self.instructions_config.get("special_cases", {}) 
special_cases_common_list = special_cases.get("common", []) special_cases_number = 1 @@ -1859,7 +1866,7 @@ class DataExtraction: instructions.append(contents) instructions.append("\n") - instructions.append("## Output requirement:\n") + instructions.append("## Output requirement\n") output_requirement = self.instructions_config.get("output_requirement", {}) output_requirement_common_list = output_requirement.get("common", []) instructions.append("\n".join(output_requirement_common_list)) diff --git a/instructions/aus_prospectus/data_extraction_prompts_config.json b/instructions/aus_prospectus/data_extraction_prompts_config.json index cf2b9e3..8417eea 100644 --- a/instructions/aus_prospectus/data_extraction_prompts_config.json +++ b/instructions/aus_prospectus/data_extraction_prompts_config.json @@ -16,7 +16,7 @@ ], "data_business_features": { "common": [ - "General rules:", + "## General rules", "- 1. The data is in the context, perhaps in table(s), semi-table(s) or paragraphs.", "- 2. Fund name: ", "a. The full fund name should be main fund name + sub-fund name, e,g, main fund name is Black Rock European, sub-fund name is Growth, the full fund name is: Black Rock European Growth.", @@ -86,7 +86,8 @@ "---Example Start---", "Retirement account \n\nInvestment option \n(A) Investment fees \nand costs (including \n(B) performance \nfees) (pa)* \n(B) Performance \nfees (pa) \n# \n(C) Transaction \ncosts (pa)*^ \n(A) + (C) Total \ninvestment cost \n(pa) \nBalanced – Indexed 0.00% 0.00% 0.00% 0.00%\n", "---Example End---", - "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values." + "For this example, as \"Investment fees and costs (including (B) performance fees)\" and \"Performance fees (pa)\" mentioned as 0.00% so return 0 as datapoint values.", + "- 7. 
If for data point value specifically Nil is written in the value then return NULL('') for the same" ], "investment_level": { "total_annual_dollar_based_charges": "Total annual dollar based charges is share level data.", @@ -145,7 +146,7 @@ "management_fee_and_costs": [ "### Management fee and cost", "Management fee and cost = Management fee + indirect cost + recoverable expense (Also known as Expense recovery cost or recovery fee or Expense recovery fee or expense recoveries) + Manager fee or Responsible entity fee.", - "If there are multiple Management fee and costs reported names, here is the priority rule:", + "A. If there are multiple Management fee and costs reported names, here are the priority rules:", "A.1 With \"Total Management fees and costs (gross)\" and \"Total Management fees and costs (net)\", pick up the values from \"Total Management fees and costs (net)\".", "---Example 1 Start---", "\n Investment option \nInvestment option \nmanagement \ncosts1 \n% p.a. \n(A)\nLifeplan \nadministration fee \n(gross)2 \n% p.a. \n(B)\nLifeplan \nadministration fee \n(net) \n% p.a. \n(C)\nTotal Management \nfees and costs \n(gross) \n% p.a. \n(A + B)\nTotal Management \nfees and costs \n(net) \n% p.a. 
\n(A + C)\nAllan Gray Australian Equity Fund \u2013 Class A\n0.77\n0.60\n0.42\n1.37\n1.19\n", @@ -196,7 +197,7 @@ "The management_fee is the value of \"Management fee (% pa)\".", "The management_fee_and_costs is the value of \"Total management cost (% pa)\".", "---Example 1 Start---", - "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.000.04 0.000.01 1.38 0.31\n1.29 0.000.00 0.000.01 1.30 0.29\n", + "Fund/Investment\nOption\nManagement\nfee (% pa)\nEstimated \nPerformance \n-related \nfees \nEstimated\nother\nindirect\ncosts\nEstimated\nexpense\nrecoveries\nEstimated\nRegulatory\nChange\nExpense\nRecovery\nTotal\nmanagement\ncost (% pa)\nEstimated\nbuy-sell\nspread (%)\nBT Future \nGoals Fund \n1.33 0.00 0.04 0.00 0.01 1.38 0.31\n1.29 0.00 0.00 0.00 0.01 1.30 0.29\n", "---Example 1 End---", "The output should be:", "{\"data\": [{\"fund name\": \"BT Future Goals Fund\", \"share name\": \"BT Future Goals Fund\", \"management_fee_and_costs\": 1.38, \"management_fee\": 1.33, \"indirect_costs\": 0.04, \"recoverable_expenses\": 0, \"change_recoverable_expenses\": 0.01, \"performance_fee_costs\": 0, \"buy_spread\": 0.31, \"sell_spread\": 0.31}]}", @@ -225,6 +226,7 @@ "---Example 3 Start---", "Fund name \nManagement \nfees and costs \n(p.a.) 
1 \nBuy/sell \nspread \n(%) 2 \nLOWER VOLATILITY SHARE \nFirst Sentier Wholesale Equity \nIncome Fund \n1.22% 0.05\nFirst Sentier Wholesale Geared \nShare Fund 3 \n1.04%(g)/2.18%(n) 4 0.20–0.50 5 \n\n", "---Example 3 End---", + "For value: 1.04%(g)/2.18%(n), (g) means gross, (n) means net, please extract net value: 2.18", "The output should be:", "{\"data\": [{\"fund name\": \"First Sentier Wholesale Equity Income Fund\", \"share name\": \"First Sentier Wholesale Equity Income Fund\", \"management_fee_and_costs\": 1.22, \"management_fee\": 1.22, \"buy_spread\": 0.05, \"sell_spread\": 0.05}, {\"fund name\": \"First Sentier Wholesale Geared Share Fund\", \"share name\": \"First Sentier Wholesale Geared Share Fund\", \"management_fee_and_costs\": 2.18, \"management_fee\": 2.18, \"buy_spread\": 0.5, \"sell_spread\": 0.5}]}", "\n", @@ -291,16 +293,6 @@ "For fund: Managed Growth, the value 0.38, including 0.11 Performance fee, so the Management costs is 0.38 - 0.11 = 0.27, performance_fee_costs is 0.11.", "So the output should be:", "{\"data\": [{\"fund name\": \"MySuper/Balanced\", \"share name\": \"MySuper/Balanced\", \"management_fee_and_costs\": 0.29, \"management_fee\": 0.29, \"performance_fee_costs\": 0.09}, {\"fund name\": \"Managed Growth\", \"share name\": \"Managed Growth\", \"management_fee_and_costs\": 0.27, \"management_fee\": 0.27, \"performance_fee_costs\": 0.11}]}", - "---Example 4 Start---", - "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) \n+ \nPerformance \nfee (% p.a.) 
\nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", - "---Example 4 End---", - "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", - "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", - "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", - "If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.", - "If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.", - "So the output should be:", - "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}", "\n", "I. If exist **\"Maximum management fee\"** in context, please ignore relevant values.", "---Example Start---", @@ -407,7 +399,7 @@ "---Example 4 Start---", "Fees and costs summary\n\nHostplus Superannuation and Personal Super Plan \n\nType of fee \nAmount \nHow and when paid \nOngoing annual fees and costs1 \nAdministration \nfees and costs \n$78.00 p.a. \n($1.50 per week) \nplus $32.24 p.a. \nDeducted monthly from \nyour account. 
\nDeducted from the Fund’s \nAdministration Reserve \nthroughout the year (and \nnot from your account). \nplus trustee fee \nof 0.0165% p.a. \nof your account \nbalance. \n", "---Example 4 End---", - "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees.", + "Attention: about plus trustee fee of 0.0165% p.a. of your account balance., it's only part of administration_fees, missing the \"first\" part, so please ignore the 0.0165% as administration_fees, only output total_annual_dollar_based_charges as 78.", "B. The administration fee and costs/ total annual dollar-based charges are with production name, other data points/ values are with specific fund/ share name(s).", "---Example Start---", "My Super \nType of fee or cost Amount How and when paid \nOngoing annual fees and costs 1 \nAdministration fees and costs \n$26.00 p.a. \nplus \n0.17% p.a. of account balance (subject to a \nmaximum of $1,000 p.a.) \n$0.50 per week deducted from your account\nbalance at the end of each month or on exit.\nPercentage fee taken into account in the \ndaily calculation of unit prices. \nInvestment fees and costs \n2 \nOption % of option’s assets* \nFund1 0.12%\n", @@ -520,7 +512,7 @@ "a. For this example, there is pure \"Performance fees\", please extract relevant values as performance_fee_costs.", "b. 
This example mentioned share classes, please output according to share class.", "The output should be", - "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"C Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"E Class\", \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", + "{\"data\": [{\"fund name\": \"Platinum International Fund\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.15}, {\"fund name\": \"Platinum Global Fund (Long Only)\", \"share name\": \"P Class\", \"performance_fee_costs\": 0.24}]}", "D. 
Identify the value of performance fee and if it is written 0% or 0.00% or 0 or 0.00 then extract the same as 0 do not assume null for the same and return its values as 0", "---Example Start---", "Fund/Investment Option \nManagement Fees \nand Costs \n(% pa) \n1 \nPerformance Fees 2 \n(% pa) \nTransaction Costs 3 \n(% pa) \nBT American Share Fund 1.08 0.00 0.00\nBT Asian Share Fund 1.10 0.00 0.10", @@ -692,6 +684,7 @@ { "keywords": ["Administration fees \nEstimated administration costs \nInvestment fees"], "keywords_is_regex": false, + "sub_datapoints": ["administration_fees", "performance_fee_costs"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -715,6 +708,7 @@ { "keywords": ["Entry Fee option \nNil Entry option"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -727,14 +721,27 @@ "---Example 1 Start---", "\nInvestment fund \nEntry Fee option \nNil Entry option \nEstimated Other investment costs \nEstimated Performance fees \nOther 1 \nOther 2 \nOther 3 \nOnePath International Shares \nIndex (Hedged) \n0.47 1.32 0.00 0.00 0.00 0.47 1.32\nPendal Concentrated Global \nShares Hedged II \n1.44 2.29 0.00 0.00 0.04 1.48 2.33\nPlatinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", "---Example 1 End---", - "The data points numbers order in data row (for example: 2.14 2.99 0.02 0.00 0.21 2.37 3.22) is correct as initial table structure.", "Please pay attention below information", - "Assume the numeric column sequence number is from 1.", - "\"Entry Fee option\" values are as the column 1 numbers, \"Nil Entry option\" values are as the column 2 numbers, \"Estimated other investment costs\" values are as the column 3 numbers, \"Estimated Performance fees\" values are as the column 4 numbers.", - "For main fund: Platinum Asia with values: 2.14 2.99 0.02 0.00 0.21 2.37 3.22, ", - "the fund: Platinum Asia Entry Fee, 
both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (the column 1 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", - "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (the column 2 number) + 0.02 (the column 3 number), performance_fee_costs is 0 (the column 4 number)", + "Assume the numeric column sequence is from 1.", + "\"Entry Fee option\" values are as the 1st column values, \"Nil Entry option\" values are as the 2nd column values, \"Estimated other investment costs\" values are as the 3rd column values, \"Estimated Performance fees\" values are as the 4th column values.", + "Here is the example to get data, step by step.", + "For this fund in Example:", + "Platinum Asia** \n2.14 2.99 0.02 0.00 0.21 2.37 3.22\n", + "Step 1 Get new fund name", + "Combine \"Platinum Asia\" with \"Entry Fee\" as \"Platinum Asia Entry Fee\"", + "Combine \"Platinum Asia\" with \"Nil Entry\" as \"Platinum Asia Nil Entry\"", + "Step 2 **EXCLUDE the values of the last three columns of data.**", + "ONLY KEEP these 4 values: 2.14 2.99 0.02 0.00 for next steps", + "Step 3 Calculate management_fee and management_fee_and_costs for these 2 new funds:", + "the fund: Platinum Asia Entry Fee, both of management_fee and management_fee_and_costs should be 2.16 = 2.14 (Value of 1st column) + 0.02 (Value of 3rd column)", + "the fund: Platinum Asia Nil Entry, both of management_fee and management_fee_and_costs should be 3.01 = 2.99 (Value of 2nd column) + 0.02 (Value of 3rd column)", + "**Make sure don't take \"Estimated other investment costs\" value from the wrong column!!!**", + "Step 4 Get performance_fee_costs", + "the fund: Platinum Asia Entry Fee, performance_fee_costs is 0 (Value of 4th column)", + "the fund: Platinum Asia Nil Entry, performance_fee_costs is 0 (Value of 4th column)", "Identify the value of the column \"Estimated Performance fees\" and if it is written 0.00 
then extract the same as 0 do not assume nil for the same and return its values as 0", + "**Make sure don't take \"Estimated Performance fees\" value from the wrong column!!!**", + "Please ignore the last fund name of previous PDF page, and extract data as these 4 steps for all of records in Context.", "Therefore, the output should be:", "{\"data\": [{\"fund name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"share name\": \"OnePath International Shares Index (Hedged) Entry Fee\", \"management_fee_and_costs\": 0.47, \"management_fee\": 0.47, \"performance_fee_costs\": 0},{\"fund name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"share name\": \"OnePath International Shares Index (Hedged) Nil Entry\", \"management_fee_and_costs\": 1.32, \"management_fee\": 1.32, \"performance_fee_costs\": 0}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Entry Fee\", \"management_fee_and_costs\": 1.44, \"management_fee\": 1.44, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"share name\": \"Pendal Concentrated Global Shares Hedged II Nil Entry\", \"management_fee_and_costs\": 2.29, \"management_fee\": 2.29, \"performance_fee_costs\": 0}]}, {\"fund name\": \"Platinum Asia Entry Fee\", \"share name\": \"Platinum Asia Entry Fee\", \"management_fee_and_costs\": 2.16, \"management_fee\": 2.16, \"performance_fee_costs\": 0}, {\"fund name\": \"Platinum Asia Nil Entry\", \"share name\": \"Platinum Asia Nil Entry\", \"management_fee_and_costs\": 3.01, \"management_fee\": 3.01, \"performance_fee_costs\": 0}" ] @@ -765,6 +772,7 @@ { "keywords": ["Recoverable expenses \nEstimated other indirect costs"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs", "interposed_vehicle_performance_fee_cost", "buy_spread", "sell_spread"], "prompts": [ "### Complex management fee and costs rule", "If 
the table with columns:", @@ -809,6 +817,7 @@ { "keywords":["Plus other investment fees and costs \nEquals investment fees and costs"], "keywords_is_regex": false, + "sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"], "prompts": [ "### Complex management fee and costs rule", "If the table with columns:", @@ -827,7 +836,7 @@ ] }, { - "keywords":["Total\\s*administration\\s*and investment\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?Investment\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], + "keywords":["Total\\s*administration\\s*and (management|investment)\\s*fees[\\s\\S]*?Administration\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], "keywords_is_regex": true, "sub_datapoints": ["administration_fees", "performance_fee_costs", "buy_spread", "sell_spread"], "prompts": [ @@ -853,6 +862,24 @@ "The output should be:", "{\"data\": [{\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Defensive\", \"management_fee_and_costs\": 0.74, \"management_fee\": 0.74, \"administration_fees\": 0.2, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Multi-Manager Multi-Sector\", \"share name\": \"CFS Conservative\", \"management_fee_and_costs\": 0.81, \"management_fee\": 0.81, \"administration_fees\": 0.20, \"performance_fee_costs\": 0.03, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" ] + }, + { + "keywords":["Total\\s*of\\s*(management|investment)\\s*fees\\s*and\\s*costs\\s*and\\s*performance\\s*fees[\\s\\S]*?(Management|Investment)\\s*fees[\\s\\S]*?Performance\\s*fee[\\s\\S]*?Buy\\/[sS]ell\\s*spread"], + "keywords_is_regex": true, + "sub_datapoints": ["performance_fee_costs", "buy_spread", "sell_spread"], + "prompts": [ + "### Complex management fee and costs rule", + "---Example Start---", + "Fund name \nTotal of management \nfees and costs and \nperformance \nfees (% p.a.) \n= \nManagement \nfees and costs \n(% p.a.) 
\n+ \nPerformance \nfee (% p.a.) \nBuy/sell \nspread \nCFS Real Return – Class A 1 \n0.87% \n0.87% \n0.15% \nCFS Defensive Builder \n0.68% \n0.67% \n0.01% \n0.15% \n", + "---Example End---", + "The column: \"Total of management fees and costs and performance fees (% p.a.)\", meaning the value is the sum of \"Management fee and costs\" and \"performance fee\", We should ignore this column values.", + "The column \"Management fees and costs (% p.a.)\" is the value of \"Management fee and costs\".", + "Both of management_fee and management_fee_and_costs are the values for \"Management fees and costs (% p.a.)\" for this case.", + "If there are 3 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the buy_spread and sell_spread.", + "If there are 4 decimal numbers, the 2nd decimal number is the management_fee_and_costs and management_fee, the 3rd decimal number is the performance_fee_costs, the 4th decimal number is buy_spread and sell_spread.", + "So the output should be:", + "{\"data\": [{\"fund name\": \"CFS Real Return – Class A\", \"share name\": \"CFS Real Return – Class A\", \"management_fee_and_costs\": 0.87, \"management_fee\": 0.87, \"buy_spread\": 0.15, \"sell_spread\": 0.15}, {\"fund name\": \"CFS Defensive Builder\", \"share name\": \"CFS Defensive Builder\", \"management_fee_and_costs\": 0.67, \"management_fee\": 0.67, \"performance_fee_costs\": 0.01, \"buy_spread\": 0.15, \"sell_spread\": 0.15}]}" + ] } ] } diff --git a/main.py b/main.py index 7534026..324c613 100644 --- a/main.py +++ b/main.py @@ -1522,8 +1522,8 @@ if __name__ == "__main__": # get_aus_prospectus_document_category() - re_run_extract_data = True - re_run_mapping_data = True + re_run_extract_data = False + re_run_mapping_data = False force_save_total_data = True doc_source = "aus_prospectus" # doc_source = "emea_ar" @@ -1531,42 +1531,46 @@ if __name__ == "__main__": # document_sample_file = ( # 
r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt" # ) - document_sample_file = ( - r"./sample_documents/aus_prospectus_46_documents_sample.txt" - ) - with open(document_sample_file, "r", encoding="utf-8") as f: - special_doc_id_list = [doc_id.strip() for doc_id in f.readlines()] - # special_doc_id_list = ["448576924"] - pdf_folder: str = r"/data/aus_prospectus/pdf/" - output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" - output_extract_data_child_folder: str = ( - r"/data/aus_prospectus/output/extract_data/docs/" - ) - output_extract_data_total_folder: str = ( - r"/data/aus_prospectus/output/extract_data/total/" - ) - output_mapping_child_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/docs/" - ) - output_mapping_total_folder: str = ( - r"/data/aus_prospectus/output/mapping_data/total/" - ) - drilldown_folder = r"/data/aus_prospectus/output/drilldown/" + document_sample_file_list = [ + r"./sample_documents/aus_prospectus_46_documents_sample.txt", + r"./sample_documents/aus_prospectus_verify_6_documents_sample.txt", + ] + for document_sample_file in document_sample_file_list: + logger.info(f"Start to run document sample file: {document_sample_file}") + with open(document_sample_file, "r", encoding="utf-8") as f: + special_doc_id_list = [doc_id.strip() for doc_id in f.readlines() + if len(doc_id.strip()) > 0] + # special_doc_id_list = ["401212184"] + pdf_folder: str = r"/data/aus_prospectus/pdf/" + output_pdf_text_folder: str = r"/data/aus_prospectus/output/pdf_text/" + output_extract_data_child_folder: str = ( + r"/data/aus_prospectus/output/extract_data/docs/" + ) + output_extract_data_total_folder: str = ( + r"/data/aus_prospectus/output/extract_data/total/" + ) + output_mapping_child_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/docs/" + ) + output_mapping_total_folder: str = ( + r"/data/aus_prospectus/output/mapping_data/total/" + ) + drilldown_folder = r"/data/aus_prospectus/output/drilldown/" - 
batch_run_documents( - doc_source=doc_source, - special_doc_id_list=special_doc_id_list, - pdf_folder=pdf_folder, - output_pdf_text_folder=output_pdf_text_folder, - output_extract_data_child_folder=output_extract_data_child_folder, - output_extract_data_total_folder=output_extract_data_total_folder, - output_mapping_child_folder=output_mapping_child_folder, - output_mapping_total_folder=output_mapping_total_folder, - drilldown_folder=drilldown_folder, - re_run_extract_data=re_run_extract_data, - re_run_mapping_data=re_run_mapping_data, - force_save_total_data=force_save_total_data - ) + batch_run_documents( + doc_source=doc_source, + special_doc_id_list=special_doc_id_list, + pdf_folder=pdf_folder, + output_pdf_text_folder=output_pdf_text_folder, + output_extract_data_child_folder=output_extract_data_child_folder, + output_extract_data_total_folder=output_extract_data_total_folder, + output_mapping_child_folder=output_mapping_child_folder, + output_mapping_total_folder=output_mapping_total_folder, + drilldown_folder=drilldown_folder, + re_run_extract_data=re_run_extract_data, + re_run_mapping_data=re_run_mapping_data, + force_save_total_data=force_save_total_data + ) elif doc_source == "emea_ar": special_doc_id_list = ["321733631"] batch_run_documents(