"""
@version: 0.1
@author: Blade He
@license: Morningstar
@contact: blade.he@morningstar.com
@site:
@software: PyCharm
@file: Similarity.py
@time: 2019/03/20
"""
# NOTE: the star import shadows the builtin ``pow`` with ``math.pow`` at
# module level. It is kept for backward compatibility with code that imports
# names from this module; the class below calls ``math.*`` explicitly.
from math import *
from decimal import Decimal
import math
import re


class Similarity:
    """Distance and similarity measures for numeric vectors, sets and text."""

    # Generic words that carry no signal when matching fund names.
    _STOP_WORDS = frozenset({"name", "fund", "funds"})
    # Anything that is not a word character is treated as a separator.
    _NON_WORD = re.compile(r'\W')

    def euclidean_distance(self, x, y):
        """Return the Euclidean distance between two numeric sequences.

        Pairs are formed with ``zip``, so extra elements of the longer
        sequence are ignored.
        """
        return math.sqrt(sum(math.pow(a - b, 2) for a, b in zip(x, y)))

    def manhattan_distance(self, x, y):
        """Return the Manhattan (L1) distance between two numeric sequences.

        Pairs are formed with ``zip``, so extra elements of the longer
        sequence are ignored.
        """
        return sum(abs(a - b) for a, b in zip(x, y))

    def minkowski_distance(self, x, y, p_value):
        """Return the Minkowski distance of order ``p_value``.

        The result is a ``Decimal`` rounded to 3 places (see ``nth_root``).
        """
        total = sum(math.pow(abs(a - b), p_value) for a, b in zip(x, y))
        return self.nth_root(total, p_value)

    def nth_root(self, value, n_root):
        """Return the ``n_root``-th root of ``value`` as a 3-place ``Decimal``.

        Raises:
            ZeroDivisionError: if ``n_root`` is 0.
        """
        exponent = 1 / float(n_root)
        return round(Decimal(value) ** Decimal(exponent), 3)

    def cosine_similarity(self, x, y):
        """Return the cosine similarity of two numeric sequences, 3 places.

        Returns 0.0 when either vector has zero magnitude (after the
        3-place rounding done by ``square_rooted``), because the angle is
        undefined in that case.
        """
        numerator = sum(a * b for a, b in zip(x, y))
        denominator = self.square_rooted(x) * self.square_rooted(y)
        if denominator == 0:
            return 0.0
        return round(numerator / float(denominator), 3)

    def square_rooted(self, x):
        """Return the Euclidean norm of ``x`` rounded to 3 places."""
        return round(math.sqrt(sum(a * a for a in x)), 3)

    def jaccard_similarity(self, x: list, y: list):
        """Return the Jaccard similarity |x & y| / |x | y| of two lists.

        Duplicates are ignored. Returns 0 when both lists are empty.
        """
        left, right = set(x), set(y)
        union_cardinality = len(left | right)
        if union_cardinality == 0:
            return 0
        return len(left & right) / float(union_cardinality)

    def y_in_x_similarity(self, x: list, y: list):
        """Return the fraction of the unique elements of ``y`` found in ``x``.

        This is an asymmetric containment measure, |x & y| / |y|, not the
        Jaccard similarity. Returns 0 when ``y`` is empty.
        """
        unique_y = set(y)
        if not unique_y:
            return 0
        return len(set(x) & unique_y) / float(len(unique_y))

    def _significant_words(self, lowered_text):
        """Return the unique non-stop-words of an already lower-cased text."""
        # Whitespace is itself a non-word character, so after the
        # substitution a plain split() collapses every separator run.
        words = self._NON_WORD.sub(' ', lowered_text).split()
        return list({word for word in words if word not in self._STOP_WORDS})

    def _prefer_feeder_candidate(self, text, compare_text_list):
        """Swap a "feeder" text for its closest "feeder" candidate, if any.

        Works around wrong matches for feeder funds, for example:
            Raw fund name: Schroders Capital UK Real Estate Fund Feeder Trust
            Fund names in database:
                Schroder UK Real Estate Fund Feeder Trust
                Schroders Capital UK Real Estate Fund
        The match should be "Schroder UK Real Estate Fund Feeder Trust",
        but without this step "Schroders Capital UK Real Estate Fund" wins.

        ``text`` must already be lower-cased. It is returned unchanged when
        it does not contain the word "feeder" or when no candidate does.
        """
        text_words = text.split()
        if "feeder" not in text_words:
            return text

        best_similarity = 0
        best_candidate = ""
        for candidate in compare_text_list:
            candidate = candidate.lower()
            candidate_words = candidate.split()
            if "feeder" not in candidate_words:
                continue
            similarity = self.y_in_x_similarity(text_words, candidate_words)
            if similarity > best_similarity:
                best_similarity = similarity
                best_candidate = candidate
        return best_candidate if best_similarity > 0 else text

    def compare_text_in_text_list_similarity(self, text: str,
                                             compare_text_list: list):
        """Find the entry of ``compare_text_list`` best covered by ``text``.

        Both sides are lower-cased, split on non-word characters and
        stripped of the generic words "name", "fund" and "funds". The score
        of a candidate is the fraction of its remaining unique words that
        also occur in ``text`` (see ``y_in_x_similarity``). On equal scores
        the candidate with more significant words wins.

        Returns:
            A ``(similarity, matched_text)`` tuple; ``matched_text`` is the
            candidate exactly as given in ``compare_text_list``. The result
            is ``(0, "")`` when either input is empty or None, or when
            nothing matches.
        """
        # Always return a tuple so callers can unpack the result safely.
        if not text or not compare_text_list:
            return 0, ""

        text = self._prefer_feeder_candidate(text.lower(), compare_text_list)
        text_words = self._significant_words(text)
        if not text_words:
            return 0, ""

        max_similarity = 0
        max_similarity_text = ""
        max_similarity_words = []
        for compare_text in compare_text_list:
            compare_words = self._significant_words(compare_text.lower())
            if not compare_words:
                continue
            similarity = self.y_in_x_similarity(text_words, compare_words)
            # On a tie, prefer the candidate with more significant words.
            longer_tie = (similarity > 0
                          and similarity == max_similarity
                          and len(compare_words) > len(max_similarity_words))
            if similarity > max_similarity or longer_tie:
                max_similarity = similarity
                max_similarity_text = compare_text
                max_similarity_words = compare_words
        return max_similarity, max_similarity_text

    def edit_distance_similarity(self, left: str, right: str):
        """Return 1 - levenshtein(left, right) / max(len(left), len(right)).

        1.0 means the strings are identical and 0.0 means they share
        nothing. Two empty strings are identical, so they score 1.0.
        """
        longest = max(len(left), len(right))
        if longest == 0:
            return 1.0

        # Only the previous row of the edit-distance matrix is needed.
        previous_row = list(range(len(right) + 1))
        for i, left_char in enumerate(left, start=1):
            current_row = [i]
            for j, right_char in enumerate(right, start=1):
                cost = 0 if left_char == right_char else 1
                current_row.append(min(previous_row[j] + 1,          # deletion
                                       current_row[j - 1] + 1,       # insertion
                                       previous_row[j - 1] + cost))  # substitution
            previous_row = current_row
        return 1 - previous_row[-1] / longest