Source code for pathbench.string_clean

import re
import string

# Punctuation common in CJK text that string.punctuation (ASCII-only) lacks.
# Written as escapes so the exact code points stay unambiguous in any editor.
# NOTE(review): U+3000 (ideographic space) is whitespace, not punctuation;
# listing it here means it is deleted rather than collapsed to a single
# space -- confirm this is intended.
_CJK_PUNCTUATION = "".join(
    [
        # U+3000 ideographic space, then 、 。 ！ ， ． ： ； ？
        "\u3000\u3001\u3002\uff01\uff0c\uff0e\uff1a\uff1b\uff1f",
        # 《 》 「 」 『 』 【 】
        "\u300a\u300b\u300c\u300d\u300e\u300f\u3010\u3011",
        # ‘ ’ “ ” (curly single and double quotes)
        "\u2018\u2019\u201c\u201d",
        # （ ） (full-width parentheses)
        "\uff08\uff09",
        # — … (em dash and horizontal ellipsis)
        "\u2014\u2026",
    ]
)

# ASCII punctuation followed by the CJK set above.
_ALL_PUNCTUATION = f"{string.punctuation}{_CJK_PUNCTUATION}"


[docs]
def clean_text(text: str) -> str:
    """
    Normalize text by lowercasing, dropping punctuation and tidying whitespace.

    The steps run in this order:
    1. Lowercase the input.
    2. Delete every character listed in ``_ALL_PUNCTUATION`` (ASCII and CJK).
       Characters are removed outright, not replaced by a space.
    3. Collapse each run of whitespace into a single space.
    4. Strip leading and trailing whitespace.
    """
    # Three-argument maketrans: the third argument lists characters to delete.
    deletion_table = str.maketrans("", "", _ALL_PUNCTUATION)
    without_punctuation = text.lower().translate(deletion_table)
    return re.sub(r"\s+", " ", without_punctuation).strip()



from functools import lru_cache



[docs]
@lru_cache(maxsize=10000)
def cached_phonemize(text: str, language: str) -> str:
    """Phonemize ``text`` with the espeak backend, memoizing the result.

    Results are cached per ``(text, language)`` pair, keeping at most 10000
    entries. The cache relies on phonemization being deterministic for a
    given pair of arguments.

    Args:
        text: The string to phonemize.
        language: Language code forwarded unchanged to ``phonemize``.

    Returns:
        Whatever ``phonemize`` returns for ``text``; the separator is
        configured with a space for phones and ``|`` for words.
    """
    # Imported inside the function, so importing this module does not itself
    # import phonemizer.
    from phonemizer.phonemize import phonemize
    from phonemizer.separator import Separator

    options = {
        "language": language,
        "backend": "espeak",
        "strip": True,
        "preserve_punctuation": False,
        "separator": Separator(phone=" ", word="|"),
    }
    return phonemize(text, **options)