1. Set TOR reported name priority

2. Optimize investment mapping logic
2024-12-06 09:54:43 -06:00 · 2024-12-06 09:54:43 -06:00 · a25991e2bb
parent 95c386911c
commit a25991e2bb
5 changed files with 56 additions and 14 deletions
--- a/core/data_extraction.py
+++ b/core/data_extraction.py
@ -531,14 +531,16 @@ class DataExtraction:
                    modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
                    if len(modified_fund_name.split()) > 1:
                        fund_name = modified_fund_name
                fund_name = self.get_fund_name(fund_name, "Fund")
                fund_name = self.get_fund_name(fund_name, "Bond")
            remove_list = ["Market Specific Equity Sub-Funds", 
-                           "International and Regional Equity Sub-Funds",
+                           "International and Regional Equity Sub-Funds",   
                           "Equity Sub-Funds"]
            for remove_item in remove_list:
                if fund_name.startswith(remove_item):
                    fund_name = fund_name.replace(remove_item, "").strip()
-            fund_name = self.get_fund_name(fund_name, "Fund")
+            
            fund_name = self.get_fund_name(fund_name, "Bond")
            data["fund name"] = fund_name
            # Clean fund name end
@ -606,6 +608,8 @@ class DataExtraction:
    def get_fund_name(self, fund_name: str, fund_feature: str):
        if not fund_name.endswith(fund_feature):
            return fund_name
        # Append a space to the feature so the plural "Funds" is not split into "Fund" + "s".
        fund_feature = fund_feature + " "
        fund_name_split = fund_name.split(fund_feature)
        if len(fund_name_split) > 1:
            last_fund = fund_name_split[-1].strip()
--- a/instructions/data_extraction_prompts_config.json
+++ b/instructions/data_extraction_prompts_config.json
@ -68,6 +68,10 @@
 			"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56"
 		},
 		"special_rule": {
 			"tor": [
 				"If there are multiple TOR reported names, here is the priority rules:",
 				"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"."
 			],
 			"ter": [
 				"If there are multiple TER value columns, here is the priority rules:",
 				"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",
--- a/main.py
+++ b/main.py
@ -1151,7 +1151,7 @@ def batch_run_documents():
        "534535767"
    ]
    special_doc_id_list = check_db_mapping_doc_id_list
-    # special_doc_id_list = ["481482392"]
+    special_doc_id_list = ["422760156"]
    pdf_folder = r"/data/emea_ar/pdf/"
    page_filter_ground_truth_file = (
        r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -1162,7 +1162,7 @@ def batch_run_documents():
    output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
    re_run_extract_data = True
    re_run_mapping_data = True
-    force_save_total_data = True
+    force_save_total_data = False
    calculate_metrics = False
    extract_ways = ["text"]
--- a/playground.ipynb
+++ b/playground.ipynb
@ -615,18 +615,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_fund_name(fund_name: str, fund_feature: str):\n",
-    "    fund_name_split = fund_name.split(fund_feature)\n",
+    "        if not fund_name.endswith(fund_feature):\n",
-    "    if len(fund_name_split) > 1:\n",
+    "            return fund_name\n",
-    "        last_fund = fund_name_split[-1].strip()\n",
+    "        fund_feature = fund_feature + \" \"\n",
-    "        if len(last_fund) == 0:\n",
+    "        fund_name_split = fund_name.split(fund_feature)\n",
-    "            last_fund = fund_name_split[-2].strip()\n",
+    "        if len(fund_name_split) > 1:\n",
-    "        fund_name = f\"{last_fund} {fund_feature}\"\n",
+    "            last_fund = fund_name_split[-1].strip()\n",
-    "    return fund_name"
+    "            if len(last_fund) == 0:\n",
    "                last_fund = fund_name_split[-2].strip()\n",
    "            fund_name = f\"{last_fund} {fund_feature}\"\n",
    "        return fund_name"
   ]
  },
  {
@ -649,6 +652,35 @@
    "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'- Global Income Conservative Fund Fund '"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "get_fund_name(fund_name, \"Fund\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
--- a/utils/biz_utils.py
+++ b/utils/biz_utils.py
@ -133,8 +133,10 @@ def get_most_similar_name(text: str,
            share_name = replace_abbrevation(share_name)
            text, share_name = replace_share_name_for_multilingual(text, share_name)
        if matching_type == "share" and share_name is None:
            text, share_name = replace_share_name_for_multilingual(text, None)
        text_splits = text.split()
        if len(text_splits) == 1:
            text = split_words_without_space(text)