diff --git a/core/data_extraction.py b/core/data_extraction.py index 380affe..e711f02 100644 --- a/core/data_extraction.py +++ b/core/data_extraction.py @@ -531,14 +531,16 @@ class DataExtraction: modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip() if len(modified_fund_name.split()) > 1: fund_name = modified_fund_name + fund_name = self.get_fund_name(fund_name, "Fund") + fund_name = self.get_fund_name(fund_name, "Bond") + remove_list = ["Market Specific Equity Sub-Funds", - "International and Regional Equity Sub-Funds", + "International and Regional Equity Sub-Funds", "Equity Sub-Funds"] for remove_item in remove_list: if fund_name.startswith(remove_item): fund_name = fund_name.replace(remove_item, "").strip() - fund_name = self.get_fund_name(fund_name, "Fund") - fund_name = self.get_fund_name(fund_name, "Bond") + data["fund name"] = fund_name # Clean fund name end @@ -606,6 +608,8 @@ class DataExtraction: def get_fund_name(self, fund_name: str, fund_feature: str): if not fund_name.endswith(fund_feature): return fund_name + # Append a space so the plural "Funds" is not split into "Fund" + "s". NOTE(review): fund_feature now ends with a space - confirm later uses of it still build the intended name. + fund_feature = fund_feature + " " fund_name_split = fund_name.split(fund_feature) if len(fund_name_split) > 1: last_fund = fund_name_split[-1].strip() diff --git a/instructions/data_extraction_prompts_config.json b/instructions/data_extraction_prompts_config.json index 8da7b39..b341774 100644 --- a/instructions/data_extraction_prompts_config.json +++ b/instructions/data_extraction_prompts_config.json @@ -68,6 +68,10 @@ "performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56" }, "special_rule": { + "tor": [ + "If there are multiple TOR reported names, here is the priority rules:", + "- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"." 
+ ], "ter": [ "If there are multiple TER value columns, here is the priority rules:", "- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".", diff --git a/main.py b/main.py index c605065..3e2e095 100644 --- a/main.py +++ b/main.py @@ -1151,7 +1151,7 @@ def batch_run_documents(): "534535767" ] special_doc_id_list = check_db_mapping_doc_id_list - # special_doc_id_list = ["481482392"] + special_doc_id_list = ["422760156"] pdf_folder = r"/data/emea_ar/pdf/" page_filter_ground_truth_file = ( r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" @@ -1162,7 +1162,7 @@ def batch_run_documents(): output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" re_run_extract_data = True re_run_mapping_data = True - force_save_total_data = True + force_save_total_data = False calculate_metrics = False extract_ways = ["text"] diff --git a/playground.ipynb b/playground.ipynb index 883c4ad..4260150 100644 --- a/playground.ipynb +++ b/playground.ipynb @@ -615,18 +615,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "def get_fund_name(fund_name: str, fund_feature: str):\n", - " fund_name_split = fund_name.split(fund_feature)\n", - " if len(fund_name_split) > 1:\n", - " last_fund = fund_name_split[-1].strip()\n", - " if len(last_fund) == 0:\n", - " last_fund = fund_name_split[-2].strip()\n", - " fund_name = f\"{last_fund} {fund_feature}\"\n", - " return fund_name" + " if not fund_name.endswith(fund_feature):\n", + " return fund_name\n", + " fund_feature = fund_feature + \" \"\n", + " fund_name_split = fund_name.split(fund_feature)\n", + " if len(fund_name_split) > 1:\n", + " last_fund = fund_name_split[-1].strip()\n", + " if len(last_fund) == 0:\n", + " last_fund = fund_name_split[-2].strip()\n", + " fund_name = f\"{last_fund} {fund_feature}\"\n", + " return fund_name" ] }, { @@ -649,6 +652,35 @@ "get_fund_name(\"A Fund 
B Fund C Fund\", \"Fund\")" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'- Global Income Conservative Fund Fund '" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_fund_name(fund_name, \"Fund\")" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/utils/biz_utils.py b/utils/biz_utils.py index 38688b2..d45e9d6 100644 --- a/utils/biz_utils.py +++ b/utils/biz_utils.py @@ -133,8 +133,10 @@ def get_most_similar_name(text: str, share_name = replace_abbrevation(share_name) text, share_name = replace_share_name_for_multilingual(text, share_name) - + if matching_type == "share" and share_name is None: + text, share_name = replace_share_name_for_multilingual(text, None) + text_splits = text.split() if len(text_splits) == 1: text = split_words_without_space(text)