1. Set TOR reported name priority
2. Optimize investment mapping logic
This commit is contained in:
parent
95c386911c
commit
a25991e2bb
|
|
@ -531,14 +531,16 @@ class DataExtraction:
|
||||||
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
|
modified_fund_name = fund_name.replace(previous_page_last_fund, "").strip()
|
||||||
if len(modified_fund_name.split()) > 1:
|
if len(modified_fund_name.split()) > 1:
|
||||||
fund_name = modified_fund_name
|
fund_name = modified_fund_name
|
||||||
|
fund_name = self.get_fund_name(fund_name, "Fund")
|
||||||
|
fund_name = self.get_fund_name(fund_name, "Bond")
|
||||||
|
|
||||||
remove_list = ["Market Specific Equity Sub-Funds",
|
remove_list = ["Market Specific Equity Sub-Funds",
|
||||||
"International and Regional Equity Sub-Funds",
|
"International and Regional Equity Sub-Funds",
|
||||||
"Equity Sub-Funds"]
|
"Equity Sub-Funds"]
|
||||||
for remove_item in remove_list:
|
for remove_item in remove_list:
|
||||||
if fund_name.startswith(remove_item):
|
if fund_name.startswith(remove_item):
|
||||||
fund_name = fund_name.replace(remove_item, "").strip()
|
fund_name = fund_name.replace(remove_item, "").strip()
|
||||||
fund_name = self.get_fund_name(fund_name, "Fund")
|
|
||||||
fund_name = self.get_fund_name(fund_name, "Bond")
|
|
||||||
data["fund name"] = fund_name
|
data["fund name"] = fund_name
|
||||||
# Clean fund name end
|
# Clean fund name end
|
||||||
|
|
||||||
|
|
@ -606,6 +608,8 @@ class DataExtraction:
|
||||||
def get_fund_name(self, fund_name: str, fund_feature: str):
|
def get_fund_name(self, fund_name: str, fund_feature: str):
|
||||||
if not fund_name.endswith(fund_feature):
|
if not fund_name.endswith(fund_feature):
|
||||||
return fund_name
|
return fund_name
|
||||||
|
# Append a trailing space to the feature before splitting, so that a word such as "Funds" is not split into "Fund" + "s".
|
||||||
|
fund_feature = fund_feature + " "
|
||||||
fund_name_split = fund_name.split(fund_feature)
|
fund_name_split = fund_name.split(fund_feature)
|
||||||
if len(fund_name_split) > 1:
|
if len(fund_name_split) > 1:
|
||||||
last_fund = fund_name_split[-1].strip()
|
last_fund = fund_name_split[-1].strip()
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,10 @@
|
||||||
"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56"
|
"performance_fee": "Performance fees is belong to percentage number, the value should be less than 100.\nPerformance fees could be negative number, e.g. -0.56"
|
||||||
},
|
},
|
||||||
"special_rule": {
|
"special_rule": {
|
||||||
|
"tor": [
|
||||||
|
"If there are multiple TOR reported names, here is the priority rules:",
|
||||||
|
"- With \"Taux de rotation - Omloopsnelheid\" and \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\", pick up the values from \"Taux de rotation corrigé - Gecorrigeerde omloopsnelheid\"."
|
||||||
|
],
|
||||||
"ter": [
|
"ter": [
|
||||||
"If there are multiple TER value columns, here is the priority rules:",
|
"If there are multiple TER value columns, here is the priority rules:",
|
||||||
"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",
|
"- With \"TER with Performance Fee\" and \"Fund TER\", pick up the values from \"TER with Performance Fee\".",
|
||||||
|
|
|
||||||
4
main.py
4
main.py
|
|
@ -1151,7 +1151,7 @@ def batch_run_documents():
|
||||||
"534535767"
|
"534535767"
|
||||||
]
|
]
|
||||||
special_doc_id_list = check_db_mapping_doc_id_list
|
special_doc_id_list = check_db_mapping_doc_id_list
|
||||||
# special_doc_id_list = ["481482392"]
|
special_doc_id_list = ["422760156"]
|
||||||
pdf_folder = r"/data/emea_ar/pdf/"
|
pdf_folder = r"/data/emea_ar/pdf/"
|
||||||
page_filter_ground_truth_file = (
|
page_filter_ground_truth_file = (
|
||||||
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
r"/data/emea_ar/ground_truth/page_filter/datapoint_page_info_88_documents.xlsx"
|
||||||
|
|
@ -1162,7 +1162,7 @@ def batch_run_documents():
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = True
|
re_run_extract_data = True
|
||||||
re_run_mapping_data = True
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = False
|
||||||
calculate_metrics = False
|
calculate_metrics = False
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -615,18 +615,21 @@
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": 3,
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def get_fund_name(fund_name: str, fund_feature: str):\n",
|
"def get_fund_name(fund_name: str, fund_feature: str):\n",
|
||||||
" fund_name_split = fund_name.split(fund_feature)\n",
|
" if not fund_name.endswith(fund_feature):\n",
|
||||||
" if len(fund_name_split) > 1:\n",
|
" return fund_name\n",
|
||||||
" last_fund = fund_name_split[-1].strip()\n",
|
" fund_feature = fund_feature + \" \"\n",
|
||||||
" if len(last_fund) == 0:\n",
|
" fund_name_split = fund_name.split(fund_feature)\n",
|
||||||
" last_fund = fund_name_split[-2].strip()\n",
|
" if len(fund_name_split) > 1:\n",
|
||||||
" fund_name = f\"{last_fund} {fund_feature}\"\n",
|
" last_fund = fund_name_split[-1].strip()\n",
|
||||||
" return fund_name"
|
" if len(last_fund) == 0:\n",
|
||||||
|
" last_fund = fund_name_split[-2].strip()\n",
|
||||||
|
" fund_name = f\"{last_fund} {fund_feature}\"\n",
|
||||||
|
" return fund_name"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|
@ -649,6 +652,35 @@
|
||||||
"get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
|
"get_fund_name(\"A Fund B Fund C Fund\", \"Fund\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"fund_name = \"JPMorgan Investment Fund - Global Income Conservative Fund\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'- Global Income Conservative Fund Fund '"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"get_fund_name(fund_name, \"Fund\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": null,
|
"execution_count": null,
|
||||||
|
|
|
||||||
|
|
@ -133,8 +133,10 @@ def get_most_similar_name(text: str,
|
||||||
share_name = replace_abbrevation(share_name)
|
share_name = replace_abbrevation(share_name)
|
||||||
|
|
||||||
text, share_name = replace_share_name_for_multilingual(text, share_name)
|
text, share_name = replace_share_name_for_multilingual(text, share_name)
|
||||||
|
|
||||||
|
|
||||||
|
if matching_type == "share" and share_name is None:
|
||||||
|
text, share_name = replace_share_name_for_multilingual(text, None)
|
||||||
|
|
||||||
text_splits = text.split()
|
text_splits = text.split()
|
||||||
if len(text_splits) == 1:
|
if len(text_splits) == 1:
|
||||||
text = split_words_without_space(text)
|
text = split_words_without_space(text)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue