1. Set TOR reported name priority

2. Optimize investment mapping logic
This commit is contained in:
Blade He 2024-12-06 09:54:43 -06:00
parent 95c386911c
commit a25991e2bb
5 changed files with 56 additions and 14 deletions

View File

@ -531,14 +531,16 @@ class DataExtraction:
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip() modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
if len(modified_fund_name.split()) > 1: if len(modified_fund_name.split()) > 1:
fund_name = modified_fund_name fund_name = modified_fund_name
fund_name = self.get_fund_name(fund_name, "Fund")
fund_name = self.get_fund_name(fund_name, "Bond")
remove_list = ["Market Specific Equity Sub-Funds", remove_list = ["Market Specific Equity Sub-Funds",
"International and Regional Equity Sub-Funds", "International and Regional Equity Sub-Funds",
"Equity Sub-Funds"] "Equity Sub-Funds"]
for remove_item in remove_list: for remove_item in remove_list:
if fund_name.startswith(remove_item): if fund_name.startswith(remove_item):
fund_name = fund_name.replace(remove_item, "").strip() fund_name = fund_name.replace(remove_item, "").strip()
fund_name = self.get_fund_name(fund_name, "Fund")
fund_name = self.get_fund_name(fund_name, "Bond")
data["fund name"] = fund_name data["fund name"] = fund_name
# Clean fund name end # Clean fund name end
@ -606,6 +608,8 @@ class DataExtraction:
def get_fund_name(self, fund_name: str, fund_feature: str): def get_fund_name(self, fund_name: str, fund_feature: str):
if not fund_name.endswith(fund_feature): if not fund_name.endswith(fund_feature):
return fund_name return fund_name
# Append a trailing space to the feature so that a plural such as "Funds" is not split into "Fund" + "s"
fund_feature = fund_feature + " "
fund_name_split = fund_name.split(fund_feature) fund_name_split = fund_name.split(fund_feature)
if len(fund_name_split) > 1: if len(fund_name_split) > 1:
last_fund = fund_name_split[-1].strip() last_fund = fund_name_split[-1].strip()

View File

@ -68,6 +68,10 @@
"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56" "performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56"
}, },
"special_rule": { "special_rule": {
"tor": [
"If there are multiple TOR reported names, here is the priority rules:",
"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"."
],
"ter": [ "ter": [
"If there are multiple TER value columns, here is the priority rules:", "If there are multiple TER value columns, here is the priority rules:",
"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".", "- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",

View File

@ -1151,7 +1151,7 @@ def batch_run_documents():
"534535767" "534535767"
] ]
special_doc_id_list = check_db_mapping_doc_id_list special_doc_id_list = check_db_mapping_doc_id_list
# special_doc_id_list = ["481482392"] special_doc_id_list = ["422760156"]
pdf_folder = r"/data/emea_ar/pdf/" pdf_folder = r"/data/emea_ar/pdf/"
page_filter_ground_truth_file = ( page_filter_ground_truth_file = (
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx" r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
@ -1162,7 +1162,7 @@ def batch_run_documents():
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/" output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
re_run_extract_data = True re_run_extract_data = True
re_run_mapping_data = True re_run_mapping_data = True
force_save_total_data = True force_save_total_data = False
calculate_metrics = False calculate_metrics = False
extract_ways = ["text"] extract_ways = ["text"]

View File

@ -615,11 +615,14 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 1, "execution_count": 3,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"def get_fund_name(fund_name: str, fund_feature: str):\n", "def get_fund_name(fund_name: str, fund_feature: str):\n",
" if not fund_name.endswith(fund_feature):\n",
" return fund_name\n",
" fund_feature = fund_feature + \" \"\n",
" fund_name_split = fund_name.split(fund_feature)\n", " fund_name_split = fund_name.split(fund_feature)\n",
" if len(fund_name_split) > 1:\n", " if len(fund_name_split) > 1:\n",
" last_fund = fund_name_split[-1].strip()\n", " last_fund = fund_name_split[-1].strip()\n",
@ -649,6 +652,35 @@
"get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")" "get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
] ]
}, },
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'- Global Income Conservative Fund Fund '"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_fund_name(fund_name, \"Fund\")"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,

View File

@ -134,6 +134,8 @@ def get_most_similar_name(text: str,
text, share_name = replace_share_name_for_multilingual(text, share_name) text, share_name = replace_share_name_for_multilingual(text, share_name)
if matching_type == "share" and share_name is None:
text, share_name = replace_share_name_for_multilingual(text, None)
text_splits = text.split() text_splits = text.split()
if len(text_splits) == 1: if len(text_splits) == 1: