refactor code
This commit is contained in:
parent
b18c48efeb
commit
8bd6008425
4
main.py
4
main.py
|
|
@ -446,7 +446,7 @@ def get_metrics(
|
||||||
ground_truth_sheet_name=ground_truth_sheet_name,
|
ground_truth_sheet_name=ground_truth_sheet_name,
|
||||||
output_folder=output_folder,
|
output_folder=output_folder,
|
||||||
)
|
)
|
||||||
missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=True)
|
missing_error_list, metrics_list, metrics_file = metrics.get_metrics(strict_model=False)
|
||||||
return missing_error_list, metrics_list, metrics_file
|
return missing_error_list, metrics_list, metrics_file
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -725,7 +725,7 @@ if __name__ == "__main__":
|
||||||
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
output_mapping_child_folder = r"/data/emea_ar/output/mapping_data/docs/"
|
||||||
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
output_mapping_total_folder = r"/data/emea_ar/output/mapping_data/total/"
|
||||||
re_run_extract_data = False
|
re_run_extract_data = False
|
||||||
re_run_mapping_data = False
|
re_run_mapping_data = True
|
||||||
force_save_total_data = True
|
force_save_total_data = True
|
||||||
|
|
||||||
extract_ways = ["text"]
|
extract_ways = ["text"]
|
||||||
|
|
|
||||||
|
|
@ -51,6 +51,9 @@ total_currency_list = [
|
||||||
|
|
||||||
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
share_features_full_name = ['Accumulation', 'Income', 'Distribution', 'Investor', 'Institutional', 'Admin', 'Advantage']
|
||||||
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
|
share_features_abbrevation = ['Acc', 'Inc', 'Dist', 'Div', 'Inv', 'Inst', 'Adm', 'Adv']
|
||||||
|
lower_pre_fix_fund_share = ['fund', "funds", 'portfolio',
|
||||||
|
'bond', 'bonds', 'class',
|
||||||
|
'classes', 'share', 'shares']
|
||||||
|
|
||||||
|
|
||||||
def add_slash_to_text_as_regex(text: str):
|
def add_slash_to_text_as_regex(text: str):
|
||||||
|
|
@ -145,26 +148,17 @@ def get_most_similar_name(text: str,
|
||||||
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
|
text, share_name, copy_name_list = update_for_currency(text, share_name, copy_name_list)
|
||||||
|
|
||||||
text = ' '.join([split for split in text.split()
|
text = ' '.join([split for split in text.split()
|
||||||
if split.lower() not in ['fund', "funds", 'portfolio',
|
if split.lower() not in lower_pre_fix_fund_share])
|
||||||
'bond', 'bonds',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
if share_name is not None:
|
if share_name is not None:
|
||||||
share_name = ' '.join([split for split in share_name.split()
|
share_name = ' '.join([split for split in share_name.split()
|
||||||
if split.lower() not in ['fund', "funds", 'portfolio',
|
if split.lower() not in lower_pre_fix_fund_share])
|
||||||
'bond', 'bonds',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
|
|
||||||
copy_share_name_list = get_share_part_list(copy_name_list)
|
copy_share_name_list = get_share_part_list(copy_name_list)
|
||||||
for i in range(len(copy_name_list)):
|
for i in range(len(copy_name_list)):
|
||||||
temp_splits = copy_name_list[i].split()
|
temp_splits = copy_name_list[i].split()
|
||||||
copy_name_list[i] = ' '.join([split for split in temp_splits
|
copy_name_list[i] = ' '.join([split for split in temp_splits
|
||||||
if remove_special_characters(split).lower()
|
if remove_special_characters(split).lower()
|
||||||
not in ['fund', "funds", 'portfolio',
|
not in lower_pre_fix_fund_share])
|
||||||
'bond', 'bonds',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
text_currency = None
|
text_currency = None
|
||||||
text_feature = None
|
text_feature = None
|
||||||
text_share_short_name_list = None
|
text_share_short_name_list = None
|
||||||
|
|
@ -192,10 +186,15 @@ def get_most_similar_name(text: str,
|
||||||
"share_currency": text_currency
|
"share_currency": text_currency
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
if share_name is not None and len(share_name.strip()) > 0:
|
||||||
text_share_short_name_list.sort()
|
text_share_short_name_list = get_share_short_name_from_text(share_name)
|
||||||
text_feature = get_share_feature_from_text(share_name)
|
text_share_short_name_list.sort()
|
||||||
text_currency = get_currency_from_text(share_name)
|
text_feature = get_share_feature_from_text(share_name)
|
||||||
|
text_currency = get_currency_from_text(share_name)
|
||||||
|
else:
|
||||||
|
text_share_short_name_list = get_share_short_name_from_text(text)
|
||||||
|
text_feature = get_share_feature_from_text(text)
|
||||||
|
text_currency = get_currency_from_text(text)
|
||||||
|
|
||||||
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
# logger.info(f"Source text: {text}, candidate names count: {len(copy_name_list)}")
|
||||||
same_max_similarity_name_list = []
|
same_max_similarity_name_list = []
|
||||||
|
|
@ -308,10 +307,7 @@ def get_share_part_list(text_list: list):
|
||||||
share_part_text = text.strip()
|
share_part_text = text.strip()
|
||||||
share_part_text = ' '.join([split for split in share_part_text.split()
|
share_part_text = ' '.join([split for split in share_part_text.split()
|
||||||
if remove_special_characters(split).lower()
|
if remove_special_characters(split).lower()
|
||||||
not in ['fund', "funds", 'portfolio',
|
not in lower_pre_fix_fund_share])
|
||||||
'bond', 'bonds',
|
|
||||||
'class', 'classes',
|
|
||||||
'share', 'shares']])
|
|
||||||
share_part_list.append(share_part_text)
|
share_part_list.append(share_part_text)
|
||||||
return share_part_list
|
return share_part_list
|
||||||
|
|
||||||
|
|
@ -367,7 +363,6 @@ def get_currency_from_text(text: str):
|
||||||
if text is None or len(text.strip()) == 0:
|
if text is None or len(text.strip()) == 0:
|
||||||
return None
|
return None
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
text = text.lower()
|
|
||||||
text_split = text.split()
|
text_split = text.split()
|
||||||
count = 0
|
count = 0
|
||||||
for split in text_split[::-1]:
|
for split in text_split[::-1]:
|
||||||
|
|
@ -380,22 +375,20 @@ def get_currency_from_text(text: str):
|
||||||
|
|
||||||
|
|
||||||
def update_for_currency(text: str, share_name: str, compare_list: list):
|
def update_for_currency(text: str, share_name: str, compare_list: list):
|
||||||
text_split = text.split()
|
currency_in_text = get_currency_from_text(text)
|
||||||
with_currency = False
|
with_currency = False
|
||||||
for split in text_split:
|
if currency_in_text is not None:
|
||||||
if split.upper() in total_currency_list:
|
with_currency = True
|
||||||
with_currency = True
|
|
||||||
break
|
|
||||||
|
|
||||||
with_currency_list = []
|
with_currency_list = []
|
||||||
without_currency_list = []
|
without_currency_list = []
|
||||||
for index, compare in enumerate(compare_list):
|
for index, compare in enumerate(compare_list):
|
||||||
compare_split = compare.split()
|
# compare_split = compare.split()
|
||||||
with_currency_compare = False
|
with_currency_compare = False
|
||||||
for split in compare_split:
|
currecy_in_compare = get_currency_from_text(compare)
|
||||||
if split.upper() in total_currency_list:
|
if currecy_in_compare is not None:
|
||||||
with_currency_compare = True
|
with_currency_compare = True
|
||||||
break
|
|
||||||
if with_currency_compare:
|
if with_currency_compare:
|
||||||
with_currency_list.append(index)
|
with_currency_list.append(index)
|
||||||
else:
|
else:
|
||||||
|
|
@ -493,15 +486,8 @@ def remove_common_word(text_list: list):
|
||||||
text = text.lower()
|
text = text.lower()
|
||||||
text = remove_special_characters(text)
|
text = remove_special_characters(text)
|
||||||
text_splits = text.split()
|
text_splits = text.split()
|
||||||
while 'fund' in text_splits:
|
text = ' '.join([split for split in text_splits
|
||||||
text_splits.remove('fund')
|
if split.lower() not in lower_pre_fix_fund_share])
|
||||||
while 'portfolio' in text_splits:
|
|
||||||
text_splits.remove('portfolio')
|
|
||||||
while 'share' in text_splits:
|
|
||||||
text_splits.remove('share')
|
|
||||||
while 'class' in text_splits:
|
|
||||||
text_splits.remove('class')
|
|
||||||
text = ' '.join(text_splits)
|
|
||||||
new_text_list.append(text)
|
new_text_list.append(text)
|
||||||
# remove the common words in new_text_list, e.g. for 'Blackrock Global Fund' and 'Blackrock Growth Fund', 'Blackrock' and 'Fund' are the common words
|
# remove the common words in new_text_list, e.g. for 'Blackrock Global Fund' and 'Blackrock Growth Fund', 'Blackrock' and 'Fund' are the common words
|
||||||
# so the result is ['Global', 'Growth']
|
# so the result is ['Global', 'Growth']
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue