dc-ml-emea-ar/utils/biz_utils.py

22 lines
732 B
Python
Raw Normal View History

import re
def add_slash_to_text_as_regex(text: str):
    r"""Turn literal *text* into a regular-expression pattern string.

    Every character that is neither a word character (``\w``) nor
    whitespace is prefixed with a backslash, and each run of whitespace is
    then replaced by the pattern ``\s+``.

    Note:
        * Backslashes already present in *text* are left as they are
          (they are not doubled).
        * A character whose escaped form (backslash + character) already
          occurs anywhere in *text* is not escaped again, so input that
          is already escaped for that character comes back unchanged.

    Args:
        text: The literal text to convert.

    Returns:
        The pattern string, or *text* itself when it is ``None`` or empty.
    """
    if not text:
        return text

    backslash = "\\"
    # Decide once, per distinct character, whether it needs escaping.
    needs_escape = {
        char
        for char in set(re.findall(r"[^\w\s]", text))
        if char != backslash and backslash + char not in text
    }
    if needs_escape:
        # Single pass over the text instead of one re.sub per match.
        text = "".join(
            backslash + char if char in needs_escape else char
            for char in text
        )

    # The replacement template r"\\s+" emits the three characters \s+.
    return re.sub(r"\s+", r"\\s+", text)
def clean_text(text: str) -> str:
    r"""Lower-case *text* and normalise its spacing.

    Steps, in order:

    1. Convert the text to lower case.
    2. Replace every literal Unicode escape sequence -- a backslash, ``u``
       and four hexadecimal digits, e.g. ``\u2004`` or ``\u00a0`` written
       out as six characters -- with a single space.
    3. Strip leading and trailing whitespace.
    4. Collapse each run of two or more spaces into one space.

    Args:
        text: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    text = text.lower()
    # Only hex digits form a \uXXXX escape; a wider class such as [0-9a-z]
    # would also blank ordinary text like "\users".
    text = re.sub(r"\\u[0-9a-f]{4}", " ", text)
    return re.sub(r" {2,}", " ", text.strip())