"""String-similarity helpers: cosine, Jaccard (character n-gram) and edit-ratio scores.

Every ``get_*_similarity`` / ``get_*_score`` function is best-effort: it returns
a float in ``[0.0, 1.0]`` and falls back to ``0.0`` (logging the cause at DEBUG
level) when the inputs cannot be processed, e.g. when ``None`` is passed.
"""

import difflib
import logging
import math
import re
from collections import Counter

try:
    from fuzzywuzzy import fuzz
except ImportError:
    # fuzzywuzzy is only needed by get_levenshtien_distance_score; keep the
    # rest of the module importable and fall back to difflib there.
    fuzz = None

logger = logging.getLogger(__name__)

WORD = re.compile(r"\w+")


def text_to_vector(text):
    """Return a ``Counter`` mapping each ``\\w+`` token in *text* to its count."""
    return Counter(WORD.findall(text))


def get_cosine_similarity(str1: str, str2: str) -> float:
    """Return the cosine similarity of the word-count vectors of two strings.

    Both strings are lower-cased and tokenised with ``WORD`` before comparison.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        A value in ``[0.0, 1.0]``. ``0.0`` is returned when either string has
        no tokens or when the inputs cannot be processed (e.g. ``None``).
    """
    try:
        vec1 = text_to_vector(str1.lower())
        vec2 = text_to_vector(str2.lower())
        # Dot product only needs the tokens present in both vectors.
        numerator = sum(vec1[token] * vec2[token] for token in vec1.keys() & vec2.keys())
        sum1 = sum(count ** 2 for count in vec1.values())
        sum2 = sum(count ** 2 for count in vec2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            return 0.0
        return numerator / denominator
    except Exception:
        # Deliberately broad: scoring is best-effort and must not raise.
        logger.debug("cosine similarity failed; returning 0.0", exc_info=True)
        return 0.0


def get_ngrams(text, n):
    """Return the set of character n-grams of *text*, ignoring spaces.

    Args:
        text: Input string; space characters are removed before slicing.
        n: Length of each n-gram; must be at least 1.

    Returns:
        A set of every length-``n`` substring of the space-stripped text.
        The set is empty when the stripped text is shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n!r}")
    text = text.replace(" ", "")
    # The last valid start index is len(text) - n, so that every slice has
    # exactly n characters (no truncated trailing grams).
    return {text[i : i + n] for i in range(len(text) - n + 1)}


def get_jaccard_similarity(str1: str, str2: str) -> float:
    """Return the Jaccard similarity of the character bigram sets of two strings.

    Both strings are lower-cased and stripped of spaces before bigrams are built.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        ``|A & B| / |A | B|`` for the two bigram sets, or ``0.0`` when both
        sets are empty or when the inputs cannot be processed (e.g. ``None``).
    """
    try:
        set1 = get_ngrams(str1.lower(), 2)
        set2 = get_ngrams(str2.lower(), 2)
        union = len(set1 | set2)
        if union == 0:
            return 0.0
        return len(set1 & set2) / union
    except Exception:
        # Deliberately broad: scoring is best-effort and must not raise.
        logger.debug("jaccard similarity failed; returning 0.0", exc_info=True)
        return 0.0


def _ratio(s1, s2):
    """Return a 0-100 integer similarity ratio for two strings.

    Uses ``fuzz.ratio`` when fuzzywuzzy is installed. Otherwise falls back to
    ``difflib.SequenceMatcher``, which is intended to approximate fuzzywuzzy's
    pure-Python behaviour.
    """
    if fuzz is not None:
        return fuzz.ratio(s1, s2)
    if s1 == s2:
        return 100
    if not s1 or not s2:
        return 0
    return int(round(100 * difflib.SequenceMatcher(None, s1, s2).ratio()))


def get_levenshtien_distance_score(str1: str, str2: str) -> float:
    """Return a case-insensitive edit-ratio similarity score for two strings.

    The (misspelled) function name is kept for backward compatibility.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The 0-100 ratio scaled to ``[0.0, 1.0]``, or ``0.0`` when the inputs
        cannot be processed (e.g. ``None``).
    """
    try:
        return _ratio(str1.lower(), str2.lower()) / 100
    except Exception:
        # Deliberately broad: scoring is best-effort and must not raise.
        logger.debug("levenshtein score failed; returning 0.0", exc_info=True)
        return 0.0