77 lines
2.2 KiB
Python
77 lines
2.2 KiB
Python
import math
|
|
import re
|
|
from collections import Counter
|
|
|
|
from fuzzywuzzy import fuzz
|
|
|
|
|
|
# Matches each maximal run of word characters (letters, digits, underscore).
WORD = re.compile(r"\w+")


def text_to_vector(text):
    """Build a bag-of-words count vector for ``text``.

    Every non-overlapping match of ``WORD`` is treated as one token. The
    result maps each distinct token to the number of times it occurs.
    No case folding is applied here; callers lower-case beforehand if
    they want case-insensitive counts.
    """
    return Counter(WORD.findall(text))
|
|
|
|
def get_cosine_similarity(str1: str, str2: str) -> float:
    """Calculate the cosine similarity between two strings.

    Both strings are lower-cased and converted to word-count vectors with
    ``text_to_vector``. The score is the dot product of the two vectors
    divided by the product of their Euclidean norms.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        The similarity as a float: ``0.0`` when the strings share no word
        tokens and ``1.0`` when their word counts are identical. ``0.0``
        is also returned when either string has no word tokens, or when
        an argument is not a string (for example ``None``).
    """
    try:
        lowered1, lowered2 = str1.lower(), str2.lower()
        vec1 = text_to_vector(lowered1)
        vec2 = text_to_vector(lowered2)
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input scores 0.0 instead of
        # raising. Only the input-type failures are caught so that genuine
        # programming errors are no longer silently hidden.
        return 0.0

    shared_tokens = vec1.keys() & vec2.keys()
    numerator = sum(vec1[token] * vec2[token] for token in shared_tokens)
    sum1 = sum(count * count for count in vec1.values())
    sum2 = sum(count * count for count in vec2.values())

    # Take one square root of the exact integer product. Multiplying two
    # separately rounded roots made identical inputs score
    # 0.9999999999999998 instead of 1.0.
    denominator = math.sqrt(sum1 * sum2)
    if not denominator:
        # At least one string produced no tokens.
        return 0.0
    return numerator / denominator
|
|
|
|
def get_ngrams(text, n):
    """Return the set of character n-grams of ``text``, ignoring spaces.

    Args:
        text: String to split. Space characters (``" "``) are removed
            first, so an n-gram may span what used to be a word boundary.
        n: Length of each n-gram; must be at least 1.

    Returns:
        A set holding every distinct substring of length ``n`` of the
        space-stripped text. The set is empty when that text is shorter
        than ``n``.

    Raises:
        ValueError: If ``n`` is less than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    compact = text.replace(" ", "")
    # A string of length L has L - n + 1 windows of width n. The previous
    # bound (L - 1) was only correct for n == 2: larger n produced truncated
    # trailing grams and n == 1 dropped the last character.
    return {compact[i : i + n] for i in range(len(compact) - n + 1)}
|
|
|
|
|
|
def get_jaccard_similarity(str1: str, str2: str) -> float:
    """Calculate the jaccard similarity between two strings.

    Each string is lower-cased and reduced to its set of character bigrams
    via ``get_ngrams(..., 2)``. The score is the size of the intersection
    of the two sets divided by the size of their union.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        A float between 0.0 (no shared bigrams) and 1.0 (identical bigram
        sets). ``0.0`` is also returned when neither string yields a
        bigram, or when an argument is not a string (for example ``None``).
    """
    try:
        lowered1, lowered2 = str1.lower(), str2.lower()
        bigrams1 = get_ngrams(lowered1, 2)
        bigrams2 = get_ngrams(lowered2, 2)
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input scores 0.0 instead of
        # raising. Only the input-type failures are caught so that genuine
        # programming errors are no longer silently hidden.
        return 0.0

    union_size = len(bigrams1 | bigrams2)
    if not union_size:
        # Both bigram sets are empty; avoid dividing by zero.
        return 0.0
    return len(bigrams1 & bigrams2) / union_size
|
|
|
|
def get_levenshtien_distance_score(str1: str, str2: str) -> float:
    """Calculate the levenshtein distance score between two strings.

    Both strings are lower-cased and compared with ``fuzz.ratio``; its
    result is divided by 100 to rescale it.

    Args:
        str1: First string to compare.
        str2: Second string to compare.

    Returns:
        ``fuzz.ratio(str1.lower(), str2.lower()) / 100`` as a float, or
        ``0.0`` when an argument is not a string (for example ``None``).
    """
    try:
        lowered1, lowered2 = str1.lower(), str2.lower()
        similarity_score = fuzz.ratio(lowered1, lowered2)
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input scores 0.0 instead of
        # raising. Only the input-type failures are caught so that genuine
        # programming errors are no longer silently hidden.
        return 0.0

    # Dividing by the constant 100 cannot raise ZeroDivisionError, so the
    # former inner try/except around this line was dead code.
    return similarity_score / 100