import re

# Any character that is neither a word character nor whitespace; these are
# the characters that get a backslash prefix when text is turned into a regex.
_NON_WORD_NON_SPACE = re.compile(r"[^\w\s]")
_WHITESPACE_RUN = re.compile(r"\s+")
# A literal, already-lowercased "\uXXXX" escape sequence (hex digits only).
_LITERAL_UNICODE_ESCAPE = re.compile(r"\\u[0-9a-f]{4}")
_SPACE_RUN = re.compile(r" {2,}")


def add_slash_to_text_as_regex(text: str):
    r"""Turn literal text into a regex pattern that matches that text.

    Every character that is neither a word character nor whitespace is
    prefixed with a backslash, and each run of whitespace is replaced by
    the pattern ``\s+`` so the result tolerates differing whitespace.

    Args:
        text: The literal text. ``None`` and ``""`` are returned unchanged.

    Returns:
        The pattern string, or ``text`` itself when it is ``None`` or empty.
    """
    if not text:
        return text
    # Escape in a single pass with a function replacement: every occurrence
    # (including the backslash itself) is escaped exactly once, independent
    # of what else the text contains, and no replacement-template escape
    # processing is applied to the matched character.
    escaped = _NON_WORD_NON_SPACE.sub(lambda match: "\\" + match.group(), text)
    # The replacement template r"\\s+" produces the two characters "\s"
    # followed by "+".
    return _WHITESPACE_RUN.sub(r"\\s+", escaped)


def clean_text(text: str) -> str:
    r"""Lower-case ``text`` and normalise its spacing.

    Literal ``\uXXXX`` escape sequences (a backslash, ``u`` and four hex
    digits, e.g. ``\u2004`` or ``\u00a0``) are replaced by a space, the
    result is stripped of surrounding whitespace, and runs of two or more
    spaces are collapsed into a single space.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    lowered = text.lower()
    # Replace literal "\uXXXX" sequences with a space. Only hex digits are
    # accepted so ordinary text following a backslash (e.g. "\users") is
    # left intact.
    without_escapes = _LITERAL_UNICODE_ESCAPE.sub(" ", lowered)
    return _SPACE_RUN.sub(" ", without_escapes.strip())