dc-ml-emea-ar/utils/similarity.py

166 lines
6.1 KiB
Python
Raw Permalink Normal View History

2024-08-19 14:52:13 +00:00
"""
@version: 0.1
@author: Blade He
@license: Morningstar
@contact: blade.he@morningstar.com
@site:
@software: PyCharm
@file: Similarity.py
@time: 2019/03/20
"""
from math import *
from decimal import Decimal
import math
import re
class Similarity:
    """Distance and similarity measures for numeric vectors, sets and text.

    The vector measures (euclidean, manhattan, minkowski, cosine) pair the
    elements of ``x`` and ``y`` with ``zip``, so when the inputs differ in
    length the extra elements of the longer one are ignored.
    """

    # Words that are dropped before fund names are compared token by token.
    _STOP_WORDS = frozenset(("name", "fund", "funds"))
    # A token is a maximal run of word characters; everything else separates.
    _WORD_PATTERN = re.compile(r"\w+")

    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two numeric sequences."""
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

    def manhattan_distance(self, x, y):
        """Return the Manhattan (L1) distance between two numeric sequences."""
        return sum(abs(a - b) for a, b in zip(x, y))

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order ``p_value``.

        The result is a ``Decimal`` rounded to 3 places (see ``nth_root``).
        """
        total = sum(abs(a - b) ** p_value for a, b in zip(x, y))
        return self.nth_root(total, p_value)

    def nth_root(self, value, n_root):
        """Return the ``n_root``-th root of ``value`` as a ``Decimal``.

        The result is rounded to 3 decimal places.

        Raises:
            ZeroDivisionError: if ``n_root`` is 0.
        """
        exponent = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(exponent), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity of two numeric sequences.

        The result is rounded to 3 decimal places. When the denominator is
        zero (an empty or all-zero vector) 0.0 is returned instead of raising
        ``ZeroDivisionError``.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        # Each norm is already rounded to 3 places by square_rooted, so a
        # vector of very small values can also produce a zero denominator.
        denominator = self.square_rooted(x) * self.square_rooted(y)
        if denominator == 0:
            return 0.0
        return round(numerator / float(denominator), 3)

    def square_rooted(self, x):
        """Return the Euclidean norm of ``x`` rounded to 3 decimal places."""
        return round(math.sqrt(sum(a * a for a in x)), 3)

    def jaccard_similarity(self, x: list, y: list):
        """Return the Jaccard similarity of the distinct elements of x and y.

        This is ``|X & Y| / |X | Y|``; 0 is returned when both are empty.
        """
        x_set, y_set = set(x), set(y)
        union_cardinality = len(x_set | y_set)
        if union_cardinality == 0:
            return 0
        return len(x_set & y_set) / float(union_cardinality)

    def y_in_x_similarity(self, x: list, y: list):
        """Return the fraction of y's distinct elements that also occur in x.

        This is ``|X & Y| / |Y|``, an asymmetric containment score rather
        than the Jaccard similarity; 0 is returned when ``y`` is empty.
        """
        x_set, y_set = set(x), set(y)
        if len(y_set) == 0:
            return 0
        return len(x_set & y_set) / float(len(y_set))

    def _tokenize(self, text: str) -> set:
        """Return the distinct lower-cased word tokens of text, minus stop words."""
        return set(self._WORD_PATTERN.findall(text.lower())) - self._STOP_WORDS

    def compare_text_in_text_list_similarity(self, text: str, compare_text_list: list):
        """Find the entry of ``compare_text_list`` that best matches ``text``.

        Both sides are lower-cased, split into word tokens and stripped of
        the stop words "name", "fund" and "funds". Each candidate is scored
        with ``y_in_x_similarity``, i.e. the share of the candidate's tokens
        that appear in ``text``. On equal scores the candidate with more
        distinct tokens wins; otherwise the earliest one is kept.

        Args:
            text: the raw text to match.
            compare_text_list: candidate strings to match against.

        Returns:
            A ``(similarity, matched_text)`` tuple, where ``matched_text`` is
            the winning candidate exactly as it appears in
            ``compare_text_list``. ``(0, "")`` is returned when either input
            is empty or ``None``, when ``text`` holds only stop words, or when
            no candidate shares a token with ``text``.
        """
        # Always return a 2-tuple so callers can unpack the result safely.
        if text is None or len(text) == 0:
            return 0, ""
        if compare_text_list is None or len(compare_text_list) == 0:
            return 0, ""

        text = text.lower()

        # Feeder funds: a raw name such as
        #   "Schroders Capital UK Real Estate Fund Feeder Trust"
        # must match "Schroder UK Real Estate Fund Feeder Trust" and not the
        # non-feeder "Schroders Capital UK Real Estate Fund". When the text
        # mentions "feeder", it is first replaced by the closest candidate
        # that also mentions "feeder"; normal scoring then runs on that.
        text_words = text.split()
        if "feeder" in text_words:
            best_feeder_score = 0
            best_feeder_text = ""
            for candidate in compare_text_list:
                candidate_lower = candidate.lower()
                candidate_words = candidate_lower.split()
                if "feeder" not in candidate_words:
                    continue
                score = self.y_in_x_similarity(text_words, candidate_words)
                if score > best_feeder_score:
                    best_feeder_score = score
                    best_feeder_text = candidate_lower
            if best_feeder_score > 0:
                text = best_feeder_text

        text_tokens = self._tokenize(text)
        if not text_tokens:
            return 0, ""

        max_similarity = 0
        max_similarity_text = ""
        max_token_count = 0
        for candidate in compare_text_list:
            candidate_tokens = self._tokenize(candidate)
            if not candidate_tokens:
                continue
            similarity = self.y_in_x_similarity(text_tokens, candidate_tokens)
            if similarity > max_similarity:
                is_better = True
            else:
                # Tie-break: prefer the candidate with more distinct tokens.
                is_better = (
                    similarity > 0
                    and similarity == max_similarity
                    and len(candidate_tokens) > max_token_count
                )
            if is_better:
                max_similarity = similarity
                max_similarity_text = candidate
                max_token_count = len(candidate_tokens)
        return max_similarity, max_similarity_text

    def edit_distance_similarity(self, left: str, right: str):
        """Return a similarity in [0, 1] based on the Levenshtein distance.

        Computed as ``1 - distance / max(len(left), len(right))``. Two empty
        strings are identical, so they yield 1.0 rather than a division by
        zero.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0

        # Row-by-row dynamic programme: previous[j] is the distance between
        # the prefix of left handled so far and right[:j].
        previous = list(range(len(right) + 1))
        for i, left_char in enumerate(left, start=1):
            current = [i]
            for j, right_char in enumerate(right, start=1):
                cost = 0 if left_char == right_char else 1
                current.append(
                    min(
                        previous[j] + 1,         # deletion
                        current[j - 1] + 1,      # insertion
                        previous[j - 1] + cost,  # substitution or match
                    )
                )
            previous = current
        return 1 - previous[-1] / longest