# Source code for pathbench.string_clean
import re
import string
# CJK and typographic punctuation not covered by string.punctuation (ASCII-only).
_CJK_PUNCTUATION = (
    # U+3000 ideographic space, U+3001 ideographic comma, U+3002 ideographic
    # full stop, then the full-width forms of ! , . : ; ?
    "\u3000\u3001\u3002\uff01\uff0c\uff0e\uff1a\uff1b\uff1f"
    # Angle, corner, white-corner and lenticular brackets (U+300A..U+3011).
    "\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011"
    # Curly single and double quotation marks.
    "\u2018\u2019\u201c\u201d"
    # Full-width parentheses.
    "\uff08\uff09"
    # Em dash and horizontal ellipsis.
    "\u2014\u2026"
)
_ALL_PUNCTUATION = string.punctuation + _CJK_PUNCTUATION

# Built once at import time instead of on every clean_text() call: the table
# maps each punctuation code point to None, which str.translate() deletes.
_PUNCTUATION_TABLE = str.maketrans("", "", _ALL_PUNCTUATION)
# Matches one or more consecutive whitespace characters.
_WHITESPACE_RUN = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Normalize *text* by lowercasing and removing punctuation and extra spaces.

    The steps are applied in this order:

    1. Lowercase the text.
    2. Delete every character in ``_ALL_PUNCTUATION`` (ASCII and CJK).
       Characters are removed outright, not replaced by a space, so
       ``"test-case"`` becomes ``"testcase"``.
    3. Collapse each run of whitespace into a single space.
    4. Strip leading and trailing whitespace.

    Note that U+3000 (ideographic space) is part of ``_CJK_PUNCTUATION`` and
    is therefore deleted in step 2 rather than collapsed in step 3.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty if nothing but punctuation and
        whitespace was present.
    """
    without_punctuation = text.lower().translate(_PUNCTUATION_TABLE)
    return _WHITESPACE_RUN.sub(" ", without_punctuation).strip()
from functools import lru_cache
# [docs]
@lru_cache(maxsize=10000)
def cached_phonemize(text: str, language: str) -> str:
    """Phonemize *text* through the espeak backend, memoizing the result.

    Results are cached per ``(text, language)`` pair in an LRU cache holding
    up to 10000 entries, on the assumption that the same input always yields
    the same output.

    Args:
        text: The text to phonemize.
        language: Language code forwarded to ``phonemize``.

    Returns:
        Whatever ``phonemize`` returns for *text*, requested with phones
        separated by a space and words separated by ``|``, with
        ``strip=True`` and ``preserve_punctuation=False``.
    """
    # Imported inside the function, so phonemizer is only required when a
    # call actually misses the cache (presumably to keep it an optional
    # dependency -- confirm against the package's requirements).
    from phonemizer.phonemize import phonemize
    from phonemizer.separator import Separator

    options = {
        "language": language,
        "backend": "espeak",
        "strip": True,
        "preserve_punctuation": False,
        "separator": Separator(phone=" ", word="|"),
    }
    return phonemize(text, **options)