166 lines
6.1 KiB
Python
166 lines
6.1 KiB
Python
|
|
"""
|
||
|
|
@version: 0.1
|
||
|
|
@author: Blade He
|
||
|
|
@license: Morningstar
|
||
|
|
@contact: blade.he@morningstar.com
|
||
|
|
@site:
|
||
|
|
@software: PyCharm
|
||
|
|
@file: Similarity.py
|
||
|
|
@time: 2019/03/20
|
||
|
|
"""
|
||
|
|
from math import *
|
||
|
|
from decimal import Decimal
|
||
|
|
import math
|
||
|
|
import re
|
||
|
|
|
||
|
|
|
||
|
|
class Similarity:
    """Distance and similarity measures for numeric vectors, token lists and strings.

    The numeric measures (euclidean, manhattan, minkowski, cosine) pair the
    elements of the two inputs with ``zip``, so extra trailing elements of
    the longer input are ignored.  The set based measures (jaccard,
    y-in-x) work on the distinct elements of their inputs.
    """

    # Words dropped before comparing names; they occur in almost every
    # candidate and therefore do not help to tell candidates apart.
    _STOP_WORDS = frozenset(("name", "fund", "funds"))

    def euclidean_distance(self, x, y):
        """Return the euclidean distance between two numeric sequences."""
        return math.sqrt(sum((a - b) ** 2 for a, b in zip(x, y)))

    def manhattan_distance(self, x, y):
        """Return the manhattan (sum of absolute differences) distance."""
        return sum(abs(a - b) for a, b in zip(x, y))

    def minkowski_distance(self, x, y, p_value):
        """Return the minkowski distance of order ``p_value``.

        The result is produced by :meth:`nth_root`, i.e. it is a ``Decimal``
        rounded to three decimal places.
        """
        total = sum(math.pow(abs(a - b), p_value) for a, b in zip(x, y))
        return self.nth_root(total, p_value)

    def nth_root(self, value, n_root):
        """Return the ``n_root``-th root of ``value`` as a ``Decimal`` rounded to 3 places."""
        exponent = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(exponent), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity of two numeric sequences, rounded to 3 places.

        Returns ``0.0`` when either vector has zero length (all zeros or
        empty), because the cosine is undefined there and dividing would
        raise ``ZeroDivisionError``.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        if denominator == 0:
            return 0.0
        return round(numerator / float(denominator), 3)

    def square_rooted(self, x):
        """Return the euclidean norm of ``x`` rounded to 3 places."""
        return round(math.sqrt(sum(a * a for a in x)), 3)

    def jaccard_similarity(self, x: list, y: list):
        """Return the jaccard similarity (|x & y| / |x | y|) of two lists.

        Returns ``0`` when both inputs are empty.
        """
        x_items, y_items = set(x), set(y)
        union_cardinality = len(x_items | y_items)
        if union_cardinality == 0:
            return 0
        return len(x_items & y_items) / float(union_cardinality)

    def y_in_x_similarity(self, x: list, y: list):
        """Return the fraction of the distinct elements of ``y`` that occur in ``x``.

        Unlike the jaccard similarity this measure is asymmetric: the
        denominator is the number of distinct elements of ``y`` only.
        Returns ``0`` when ``y`` is empty.
        """
        y_items = set(y)
        if not y_items:
            return 0
        return len(set(x) & y_items) / float(len(y_items))

    def _name_tokens(self, text: str):
        """Return the distinct lower-cased word tokens of ``text`` without stop words."""
        words = re.sub(r'\W', ' ', text.lower()).split()
        return {word for word in words if word not in self._STOP_WORDS}

    def compare_text_in_text_list_similarity(self, text: str, compare_text_list: list):
        """Find the entry of ``compare_text_list`` that is best covered by ``text``.

        Every candidate is scored with :meth:`y_in_x_similarity`, i.e. by the
        share of the candidate's tokens that also occur in ``text``.  When
        two candidates reach the same positive score, the one with more
        tokens is preferred.

        Returns a tuple ``(similarity, matched_text)`` where ``matched_text``
        is the candidate exactly as it appears in ``compare_text_list``.
        ``(0, "")`` is returned when an input is empty or ``None``, or when
        no token is left to compare.
        """
        if not text or not compare_text_list:
            # Same shape as every other return so callers can always unpack.
            return 0, ""

        text = text.lower()

        # Feeder funds: a raw name such as
        #   "Schroders Capital UK Real Estate Fund Feeder Trust"
        # must match "Schroder UK Real Estate Fund Feeder Trust" and not the
        # non-feeder "Schroders Capital UK Real Estate Fund".  To achieve this
        # the raw text is first replaced by the best matching feeder
        # candidate, which then scores 1.0 against itself below.
        text_words = text.split()
        if "feeder" in text_words:
            best_feeder_score = 0
            best_feeder_text = ""
            for candidate in compare_text_list:
                candidate_words = candidate.lower().split()
                if "feeder" not in candidate_words:
                    continue
                score = self.y_in_x_similarity(text_words, candidate_words)
                if score > best_feeder_score:
                    best_feeder_score = score
                    best_feeder_text = candidate.lower()
            if best_feeder_score > 0:
                text = best_feeder_text

        text_tokens = self._name_tokens(text)
        if not text_tokens:
            return 0, ""

        max_similarity = 0
        max_similarity_text = ""
        max_token_count = 0
        for candidate in compare_text_list:
            candidate_tokens = self._name_tokens(candidate)
            if not candidate_tokens:
                continue
            similarity = self.y_in_x_similarity(text_tokens, candidate_tokens)
            is_longer_tie = (
                similarity > 0
                and similarity == max_similarity
                and len(candidate_tokens) > max_token_count
            )
            if similarity > max_similarity or is_longer_tie:
                max_similarity = similarity
                max_similarity_text = candidate
                max_token_count = len(candidate_tokens)

        return max_similarity, max_similarity_text

    def edit_distance_similarity(self, left: str, right: str):
        """Return ``1 - levenshtein(left, right) / max(len(left), len(right))``.

        The result is ``1.0`` for identical strings and ``0.0`` when every
        character has to be changed.  Two empty strings are identical, so
        ``1.0`` is returned for them instead of dividing by zero.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0

        # Only the previous row of the edit-distance matrix is needed to
        # compute the next one.
        previous_row = list(range(len(right) + 1))
        for i, left_char in enumerate(left, start=1):
            current_row = [i]
            for j, right_char in enumerate(right, start=1):
                cost = 0 if left_char == right_char else 1
                current_row.append(min(previous_row[j] + 1,          # deletion
                                       current_row[j - 1] + 1,       # insertion
                                       previous_row[j - 1] + cost))  # substitution
            previous_row = current_row

        return 1 - previous_row[-1] / longest
|
||
|
|
|