dc-ml-emea-ar/core/auz_nz/string_similarity.py

77 lines
2.2 KiB
Python

import math
import re
from collections import Counter
from fuzzywuzzy import fuzz
# Pre-compiled pattern used to pull word tokens out of a piece of text.
WORD = re.compile(r"\w+")


def text_to_vector(text):
    """
    Build a bag-of-words vector for ``text``.

    Every non-overlapping match of the module-level ``WORD`` pattern is
    treated as one token. Matching is done on the text exactly as given
    (no case folding happens here).

    Returns:
        A ``Counter`` mapping each token to its number of occurrences.
    """
    token_counts = Counter()
    token_counts.update(WORD.findall(text))
    return token_counts
def get_cosine_similarity(str1: str, str2: str):
    """
    Calculate the cosine similarity between two strings.

    Both strings are lower-cased and converted to word-count vectors with
    ``text_to_vector``; the score is the dot product of the two vectors
    divided by the product of their Euclidean norms.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The similarity as a float. 0.0 is returned when either argument is
        not a ``str`` (for example ``None``) or when either string yields
        no word tokens.
    """
    # Non-string input is scored as "no similarity" instead of raising, so
    # callers feeding in missing values keep getting 0.0. Anything else that
    # goes wrong is a genuine bug and is allowed to surface.
    if not isinstance(str1, str) or not isinstance(str2, str):
        return 0.0

    vec1 = text_to_vector(str1.lower())
    vec2 = text_to_vector(str2.lower())

    # Only words present in both strings contribute to the dot product.
    shared_words = vec1.keys() & vec2.keys()
    numerator = sum(vec1[word] * vec2[word] for word in shared_words)

    squared_norm1 = sum(count ** 2 for count in vec1.values())
    squared_norm2 = sum(count ** 2 for count in vec2.values())

    # Take one square root of the exact integer product rather than
    # multiplying two rounded square roots: identical inputs then score
    # exactly 1.0 instead of drifting slightly above or below it.
    denominator = math.sqrt(squared_norm1 * squared_norm2)
    if not denominator:
        return 0.0
    return float(numerator) / denominator
def get_ngrams(text, n):
    """
    Return the set of character n-grams of ``text`` with spaces removed.

    Args:
        text: Input string. Only the space character ``" "`` is stripped
            before the n-grams are taken; other whitespace is kept.
        n: Length of each n-gram; expected to be a positive integer.

    Returns:
        A set of every contiguous ``n``-character substring of the
        space-stripped text. The set is empty when that text is shorter
        than ``n``.
    """
    compact = text.replace(" ", "")
    # Stop at the last start index that still leaves a full n-character
    # window, so no truncated tail fragments end up in the result.
    return {compact[start : start + n] for start in range(len(compact) - n + 1)}
def get_jaccard_similarity(str1: str, str2: str) -> float:
    """
    Calculate the jaccard similarity between two strings.

    Each string is lower-cased and turned into its set of character
    bigrams via ``get_ngrams(..., 2)``; the score is the size of the
    intersection of the two sets divided by the size of their union.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The similarity as a float between 0.0 and 1.0. 0.0 is returned when
        either argument is not a ``str`` (for example ``None``) or when
        neither string produces any bigram.
    """
    # Non-string input is scored as "no similarity" instead of raising, so
    # callers feeding in missing values keep getting 0.0. Anything else that
    # goes wrong is a genuine bug and is allowed to surface.
    if not isinstance(str1, str) or not isinstance(str2, str):
        return 0.0

    bigrams1 = get_ngrams(str1.lower(), 2)
    bigrams2 = get_ngrams(str2.lower(), 2)

    union_size = len(bigrams1 | bigrams2)
    if not union_size:
        # Both strings were too short to yield a bigram.
        return 0.0
    return len(bigrams1 & bigrams2) / union_size
def get_levenshtien_distance_score(str1: str, str2: str) -> float:
    """
    Calculate the levenshtein distance score between two strings.

    Both strings are lower-cased and compared with ``fuzz.ratio``; its
    result is divided by 100 before being returned.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        ``fuzz.ratio(str1.lower(), str2.lower()) / 100`` as a float, or 0.0
        when either argument is not a ``str`` (for example ``None``).
    """
    # Non-string input is scored as "no similarity" instead of raising, so
    # callers feeding in missing values keep getting 0.0. Anything else that
    # goes wrong is a genuine bug and is allowed to surface.
    if not isinstance(str1, str) or not isinstance(str2, str):
        return 0.0

    # Dividing by the constant 100 cannot raise ZeroDivisionError, so no
    # handler is needed around it.
    return fuzz.ratio(str1.lower(), str2.lower()) / 100