From 352886ade26a55ad31e4b4c60158a2c240452acc Mon Sep 17 00:00:00 2001 From: Blade He Date: Mon, 2 Dec 2024 11:45:19 -0600 Subject: [PATCH] update instructions for TER, OGC, Performance Fees --- configuration/datapoint_reported_name.json | 2 + .../data_extraction_prompts_config.json | 39 ++++++++++++++++++ main.py | 4 +- playground.ipynb | 41 +++++++++++++++++++ 4 files changed, 84 insertions(+), 2 deletions(-) diff --git a/configuration/datapoint_reported_name.json b/configuration/datapoint_reported_name.json index 1d91860..7e1842b 100644 --- a/configuration/datapoint_reported_name.json +++ b/configuration/datapoint_reported_name.json @@ -35,6 +35,7 @@ "Ratio de gastos totales" ], "german": [ + "TER inkl. Performance-Fee in % **)", "Gesamtgebühren", "Kostenpauschale", "Gesamtkostenquote", @@ -242,6 +243,7 @@ "Gastos corrientes en porcentaje 3)" ], "german": [ + "Ongoing Charges inkl.Performance-Fee in % **)", "Laufende Kosten", "Laufende Kosten in Prozent", "Laufende Kosten 1)", diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 6d11b18..6594dde 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -68,12 +68,19 @@ "- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".", "- With \"TER (en %) (with performance)\" and \"TER(en %) (without performance)\", pick up the values from \"TER (en %) (with performance)\".", "- With \"TER including Performance Fee\" and \"TER excluding Performance Fee\", pick up the values from \"TER including Performance Fee\".", + "- With \"TER inkl. Performance-Fee in % **)\" and \"TER exkl. Performance-Fee in % **)\", pick up the values from \"TER inkl. Performance-Fee in % **)\".", + "- With \"TER inkl. Performance-Fee in % **)\" and \"TER inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"TER inkl. 
Performance-Fee in % **)\".", "- With both of \"Synthetic TER\" and \"Fund TER\", if \"Synthetic TER\" with value(s), pick up the value(s) from \"Synthetic TER\", otherwise, pick up the value(s) from \"Fund TER\".", "- With both of \"Net TER\" and \"Capped Expense Ratio\", the priority is \"Net TER\", please exclude the column: \"Capped Expense Ratio\", only pick up the values from \"Net TER\".", "- With \"Gross TER\", \"Waiver\", \"Net TER\", \"Capped Expense Ratio\" as column titles, pick up the values from \"Net TER\".", "- If exist Gross TER as column title, please ignore this title", "Please ignore TER values which with the exception of performance fees or excluded performance fees." ], + "ogc": [ + "If there are multiple OGC value columns, here is the priority rules:", + "- With \"Ongoing Charges inkl. Performance-Fee in % **)\" and \"Ongoing Charges exkl. Performance-Fee in % **)\", pick up the values from \"Ongoing Charges inkl. Performance-Fee in % **)\".", + "- With \"Ongoing Charges inkl. Performance-Fee in % **)\" and \"Ongoing Charges inkl. Performance-Fee in % (inkl. Zielfonds)\", pick up the values from \"Ongoing Charges inkl. Performance-Fee in % **)\"." + ], "performance_fee": [ "The performance fees should not be the presence of the rates at which the performance fees are calculated." 
] @@ -162,12 +169,20 @@ "The performance fees should be:", "TER including performance fees - TER excluding performance fees.", "Here is the example:", + "Example 1:", "-----Example Start-----", "GAMAX FUNDS FCP\\nClass\\nTER (excluding Performance Fees)\\nTER (including Performance Fees)\\nGAMAX FUNDS - ASIA PACIFIC\\nA\\n2.07%\\n2.07%\\n", "-----Example End-----", "The output should be:", "{\"data\": [{\"fund name\": \"GAMAX FUNDS - ASIA PACIFIC\", \"share name\": \"A\", \"ter\": 2.07, \"performance_fee\": 0}]}", "The performance fees value is TER (including Performance Fees) - TER (excluding Performance Fees) = 2.07 - 2.07 = 0", + "Example 2:", + "-----Example Start-----", + "D/S Strategie ausgewogen\\nErgänzende Angaben für Anleger in der Schweiz zum 31. Dezember 2020 (ungeprüft)\\nFonds\\nTER exkl. \\nPerformance-Fee in % **)\\nTER inkl. \\nPerformance-Fee in % **)\\nTER inkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\n1,15\\n1,63\\n1,15\\n1,63\\nTER exkl.\\nPerformance-Fee in % (inkl. Zielfonds)", + "-----Example End-----", + "The output should be:", + "{\"data\": [{\"fund name\": \"D/S Strategie ausgewogen\", \"ter\": 1.15, \"performance_fee\": 0}]}", + "The performance fees value is TER inkl. Performance-Fee in % **) - TER exkl. 
Performance-Fee in % **) = 1,15 - 1,15 = 0", "Case 2:", "If some table is with three value columns: \"TER including performance fees\", \"TER excluding performance fees\", \"Performance fees\", ", "The Performance fees value in column: Performance fees, chould be \"-\", because of \"TER including performance fees\" - \"TER excluding performance fees\" = 0, ", @@ -184,6 +199,30 @@ "The output should be:", "{\"data\": [{\"fund name\": \"Pictet Corto Europe Long Short\", \"share name\": \"P EUR\", \"ter\": 1.98, \"performance_fee\": 0}]}" ] + }, + { + "title": "Performance fees is part of OGC:", + "contents": [ + "If exist both of \"Ongoing Charges including Performance Fee\" and \"Ongoing Charges excluding Performance Fee\",", + "The OGC should be \"Ongoing Charges including Performance Fee\".", + "The performance fees should be:", + "Ongoing Charges including Performance Fee - Ongoing Charges excluding Performance Fee.", + "Here is the example:", + "Example 1:", + "-----Example Start-----", + "GAMAX FUNDS FCP\\nClass\\nOngoing Charges (excluding Performance Fees)\\nOngoing Charges (including Performance Fees)\\nFund 1\\nShare A\\n1.50%\\n1.58%\\n", + "-----Example End-----", + "The output should be:", + "{\"data\": [{\"fund name\": \"Fund 1\", \"share name\": \"Share A\", \"ogc\": 1.58, \"performance_fee\": 0.08}]}", + "The performance fees value is Ongoing Charges including Performance Fee - Ongoing Charges excluding Performance Fee = 1.58 - 1.50 = 0.08", + "Example 2:", + "-----Example Start-----", + "Fund1\\nOngoing Charges exkl. \\nPerformance-Fee in % **)\\nOngoing Charges exkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\nOngoing Charges inkl. \\nPerformance-Fee in % **)\\nOngoing Charges inkl. \\nPerformance-Fee in % (inkl. Zielfonds)\\n1,15\\n1,35\\n1,20\\n1,35\\n", + "-----Example End-----", + "The output should be:", + "{\"data\": [{\"fund name\": \"Fund1\", \"ogc\": 1.20, \"performance_fee\": 0.05}]}", + "The performance fees value is Ongoing Charges inkl. 
Performance-Fee in % **) - Ongoing Charges exkl. Performance-Fee in % **) = 1.20 - 1.15 = 0.05" + ] } ] }, diff --git a/main.py b/main.py index 71ecd84..ffb44f6 100644 --- a/main.py +++ b/main.py @@ -1194,12 +1194,12 @@ if __name__ == "__main__": "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["334584772"] + special_doc_id_list = ["334584772"] output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"] diff --git a/playground.ipynb b/playground.ipynb index 7faf7de..3504fa4 100644 --- a/playground.ipynb +++ b/playground.ipynb @@ -572,6 +572,47 @@ "list(document_mapping[\"ShareClassName\"].unique())" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pymupdf4llm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing ./data/emea_ar/pdf/501380553.pdf...\n", + "[ ] (0/47[ ] ( 1/47[= ] ( 2/4[== ] ( 3/47[=== ] ( 4/4[==== ] ( 5/47[===== ] ( 6/47[===== ] ( 7/4[====== ] ( 8/47[======= ] ( 9/4[======== ] (10/47[========= ] (11/4[========== ] (12/47[=========== ] (13/47[=========== ] (14/4[============ ] (15/47[============= ] (16/4[============== ] (17/47[=============== ] (18/4[================ ] (19/47[================= ] (20/47[================= ] (21/4[================== ] (22/47[=================== ] (23/4[==================== ] (24/47[===================== ] (25/4[====================== ] (26/4[====================== ] (27/47[======================= ] (28/4[======================== ] (29/47[========================= ] (30/4[========================== ] (31/47[=========================== ] 
(32/4[============================ ] (33/4[============================ ] (34/47[============================= ] (35/4[============================== ] (36/47[=============================== ] (37/4[================================ ] (38/47[================================= ] (39/4[================================== ] (40/4[================================== ] (41/47[=================================== ] (42/4[==================================== ] (43/47[===================================== ] (44/4[====================================== ] (45/47[======================================= ] (46/47[========================================] (47/47]\n" + ] + }, + { + "data": { + "text/plain": [ + "107851" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_text = pymupdf4llm.to_markdown(r\"./data/emea_ar/pdf/501380553.pdf\")\n", + "\n", + "# now work with the markdown text, e.g. store as a UTF8-encoded file\n", + "import pathlib\n", + "pathlib.Path(r\"./data/emea_ar/output/markdown/501380553.md\").write_bytes(md_text.encode())" + ] + }, { "cell_type": "code", "execution_count": null,