# dc-ml-emea-ar/core/auz_nz/string_similarity.py

import math
import re
from collections import Counter

from fuzzywuzzy import fuzz


# Pre-compiled pattern matching maximal runs of word characters (`\w`);
# used to split text into word tokens.
WORD = re.compile(r"\w+")


def text_to_vector(text):
    """
    Build a bag-of-words vector for ``text``.

    The text is split into tokens with the module-level ``WORD`` pattern and
    a ``Counter`` mapping each token to its number of occurrences is returned.
    """
    return Counter(WORD.findall(text))

def get_cosine_similarity(str1: str, str2: str) -> float:
    """
    Calculate the cosine similarity between two strings.

    Both strings are lower-cased and converted to word-count vectors with
    ``text_to_vector``. The score is the dot product of the two vectors
    divided by the product of their Euclidean norms.

    Returns:
        A float, nominally between 0.0 and 1.0. 0.0 is returned when either
        string contains no word tokens, when the strings share no word, or
        when an argument is not a usable string (e.g. ``None``).
    """
    try:
        vec1 = text_to_vector(str1.lower())
        vec2 = text_to_vector(str2.lower())
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input (no ``.lower()``, or a type
        # the tokenizer rejects) is scored as "no similarity" instead of
        # raising. Any other exception is a genuine bug and must surface.
        return 0.0

    # Only words present in both strings contribute to the dot product.
    shared_words = vec1.keys() & vec2.keys()
    numerator = sum(vec1[word] * vec2[word] for word in shared_words)

    norm1 = math.sqrt(sum(count ** 2 for count in vec1.values()))
    norm2 = math.sqrt(sum(count ** 2 for count in vec2.values()))
    denominator = norm1 * norm2
    if not denominator:
        # At least one string has no tokens; avoid dividing by zero.
        return 0.0
    return float(numerator) / denominator

def get_ngrams(text, n):
    """
    Return the set of distinct character n-grams of ``text``.

    Spaces are removed before the n-grams are extracted, so n-grams may span
    what were originally word boundaries.

    Args:
        text: The string to split into n-grams.
        n: Length of each n-gram; expected to be a positive integer.

    Returns:
        A set of substrings of length ``n``. The set is empty when the
        space-stripped text is shorter than ``n``.
    """
    text = text.replace(" ", "")  # Remove spaces
    # The last full window starts at index len(text) - n. The previous bound
    # of len(text) - 1 was only correct for n == 2: it emitted a truncated
    # trailing gram for n > 2 and dropped the final character for n == 1.
    return {text[i : i + n] for i in range(len(text) - n + 1)}


def get_jaccard_similarity(str1: str, str2: str) -> float:
    """
    Calculate the jaccard similarity between two strings.

    Each string is lower-cased and reduced to its set of character bigrams
    via ``get_ngrams``. The score is the size of the intersection of the two
    sets divided by the size of their union.

    Returns:
        A float between 0.0 and 1.0. 0.0 is returned when neither string
        yields a bigram or when an argument is not a usable string
        (e.g. ``None``).
    """
    try:
        bigrams1 = get_ngrams(str1.lower(), 2)
        bigrams2 = get_ngrams(str2.lower(), 2)
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input is scored as "no similarity"
        # instead of raising. Any other exception is a genuine bug and must
        # surface rather than be silently swallowed.
        return 0.0

    union_size = len(bigrams1 | bigrams2)
    if union_size == 0:
        # Both bigram sets are empty; avoid dividing by zero.
        return 0.0
    return len(bigrams1 & bigrams2) / union_size

def get_levenshtien_distance_score(str1: str, str2: str) -> float:
    """
    Calculate the levenshtein distance score between two strings.

    Both strings are lower-cased and compared with ``fuzz.ratio``; its result
    (documented by fuzzywuzzy as an integer percentage) is divided by 100.

    Returns:
        The rescaled similarity score as a float, or 0.0 when an argument is
        not a usable string (e.g. ``None``).
    """
    try:
        similarity_score = fuzz.ratio(str1.lower(), str2.lower())
    except (AttributeError, TypeError):
        # Best-effort contract: non-string input is scored as "no similarity"
        # instead of raising. Any other exception is a genuine bug and must
        # surface rather than be silently swallowed.
        return 0.0
    # Division by the constant 100 cannot raise ZeroDivisionError, so the
    # former guard around it was unreachable and has been removed.
    return similarity_score / 100