# Source code for pathbench.cpp_evaluator

import math
from typing import Optional

import numpy as np
import scipy.signal
import librosa
import parselmouth
from parselmouth.praat import call

from pathbench.evaluator import ReferenceFreeEvaluator

eps = np.finfo(float).eps


# [docs] -- Sphinx viewcode link marker (not code)
def cpp_func(x, fs, normOpt, double_log=False):
    """
    Computes frame-wise cepstral peak prominence (CPP) for a given signal

    The signal is pre-emphasised, cut into Hanning-windowed frames of about
    40 ms taken every 10 ms, and for every frame the cepstral peak inside the
    quefrency range corresponding to a pitch of 60-333.3 Hz is compared with
    a baseline computed from that same frame.

    Parameters
    -----------
    x: ndarray
        The audio signal (expected to be one-dimensional)
    fs: integer
        The sampling frequency
    normOpt: string
        Baseline subtracted from the cepstral peak of each frame:
        'line' uses the value, at the peak quefrency, of a straight line
        fitted to that frame's cepstrum over the search range; 'mean' uses
        the mean of that frame's cepstrum over the search range; 'nonorm'
        (or any other value) subtracts nothing.
    double_log: bool
        If True, uses the legacy double-log formulation (incorrect but kept for comparison).
        If False (default), uses the standard CPP formulation.

    Returns
    -----------
    cpp: ndarray
        One CPP value per frame
    time_samples: ndarray
        The sample position each frame is centred on (1-based, matching the
        slicing arithmetic below). Both arrays are empty when the signal is
        too short to hold a single frame.
    """
    # Settings
    x_len = len(x)
    frame_length = int(np.round(0.04*fs))
    frame_shift = int(np.round(0.01*fs))
    half_len = int(np.round(frame_length/2))
    # Frames are symmetric around their centre sample, hence the odd length.
    frame_len = half_len*2 + 1

    # Allowed quefrency range (pitch 60-333.3 Hz), as 0-based cepstrum bins
    pitch_range = [60, 333.3]
    quef_axis = np.arange(int(np.round(fs/pitch_range[1])) - 1,
                          int(np.round(fs/pitch_range[0])))

    # Frame centres
    time_samples = np.arange(frame_length+1, x_len-frame_length+1, frame_shift)
    N = len(time_samples)
    if N == 0:
        return np.array([]), np.array([])

    # High-pass filtering (pre-emphasis)
    x = scipy.signal.lfilter([1, -0.97], 1, x)

    # Frame matrix, one frame per column: rows are the samples
    # x[centre-half_len-1 : centre+half_len] of each frame.  No zero-padding
    # is applied, so the FFT length equals frame_len.
    first_sample = time_samples - half_len - 1
    frames = x[first_sample[np.newaxis, :] + np.arange(frame_len)[:, np.newaxis]]

    # Symmetric Hanning window without the zero end points
    k = np.arange(1, frame_len + 1)
    win = 0.5 - 0.5*np.cos(2*np.pi*k/(frame_len + 1))
    frames = frames*win[:, np.newaxis]

    # Cepstrum computation; the machine epsilon keeps the logarithms finite
    tiny = np.finfo(float).eps
    spec_db = 20*np.log10(np.abs(np.fft.fft(frames, axis=0)) + tiny)
    # Standard CPP: cepstrum = FFT(log(spectrum))
    ceps = np.abs(np.fft.fft(spec_db, axis=0))
    if double_log:
        # Legacy (incorrect) formulation: extra log of cepstrum
        ceps = 20*np.log10(ceps + tiny)

    # Finding the peak in quefrency range
    ceps_lim = ceps[quef_axis, :]
    max_index = ceps_lim.argmax(axis=0)
    ceps_max = ceps_lim.max(axis=0)

    # Normalisation (regression line or mean), always computed per frame
    if normOpt == 'line':
        # polyfit fits every column of a 2-D y against the shared x in one call
        slope, intercept = np.polyfit(quef_axis, ceps_lim, 1)
        ceps_norm = slope*quef_axis[max_index] + intercept
    elif normOpt == 'mean':
        # axis=0 is essential: without it the mean runs over all frames at
        # once and each frame's CPP would depend on the rest of the signal.
        ceps_norm = ceps_lim.mean(axis=0)
    else:
        ceps_norm = np.zeros(N)

    cpp = ceps_max - ceps_norm

    return cpp, time_samples




# [docs] -- Sphinx viewcode link marker (not code)
class CPPEvaluator(ReferenceFreeEvaluator):
    """Cepstral Peak Prominence (standard formulation). Reference-free.

    Scores an utterance as the mean of the frame-wise CPP values returned by
    ``cpp_func``.
    """

    def __init__(self, normOpt: str = 'line'):
        """Store the normalisation option that is forwarded to ``cpp_func``.

        Parameters
        -----------
        normOpt: str
            Passed unchanged to ``cpp_func`` ('line', 'mean' or 'nonorm').
        """
        self.normOpt = normOpt
        # Subclasses flip this flag to select the legacy double-log cepstrum.
        self.double_log = False

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Load (a segment of) an audio file and return its mean CPP.

        Parameters
        -----------
        utterance_id: str
            Not used by this method.
        audio_path: str
            Path handed to ``librosa.load``.
        start_time: float
            Offset in seconds at which loading starts.
        end_time: float
            End of the segment in seconds; the sentinel -1.0 means "load to
            the end of the file".

        Returns
        -----------
        Optional[float]
            The mean CPP, or None when loading fails, the audio is empty, or
            ``cpp_func`` produces no frames.
        """
        try:
            duration = end_time - start_time if end_time != -1.0 else None
            # Audio is resampled to 16 kHz and mixed down to mono on load.
            audio, fs = librosa.load(audio_path, sr=16000, mono=True,
                                     offset=start_time, duration=duration)
        except Exception as e:
            # Best-effort scoring: report the failure and signal "no score".
            print(f"Error reading audio file {audio_path}: {e}")
            return None

        if audio is None or len(audio) == 0:
            print(f"Warning: Audio for {audio_path} is empty.")
            return None

        return self._score_audio(audio, fs)

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """Return the mean frame-wise CPP of ``audio``, or None if no frames fit."""
        cpp, _ = cpp_func(audio, fs, self.normOpt, double_log=self.double_log)
        if len(cpp) == 0:
            return None
        return float(np.mean(cpp))




# [docs] -- Sphinx viewcode link marker (not code)
class CPPDoubleLogEvaluator(CPPEvaluator):
    """CPP evaluator that keeps the legacy double-log cepstrum for comparison.

    Standard CPP:   peak(FFT(log(spectrum))) - regression_line
    This variant:   peak(log(FFT(log(spectrum)))) - regression_line

    The extra logarithm is known to be incorrect; the class only exists so
    results from the old formulation can still be reproduced.
    """

    def __init__(self, normOpt: str = 'line'):
        # Let the parent store normOpt, then switch on the legacy flag that
        # the parent initialises to False.
        super(CPPDoubleLogEvaluator, self).__init__(normOpt=normOpt)
        self.double_log = True




# [docs] -- Sphinx viewcode link marker (not code)
class PraatCPPEvaluator(ReferenceFreeEvaluator):
    """CPP evaluator using Praat's built-in PowerCepstrogram implementation via parselmouth.

    This is the reference implementation used in clinical voice research.
    Uses Praat's "Get CPPS" command which computes smoothed Cepstral Peak Prominence
    following the methodology of Hillenbrand et al. (1994).

    Reference: https://www.fon.hum.uva.nl/praat/manual/PowerCepstrogram__Get_CPPS___.html
    """

    # Sounds with fewer samples than this are rejected before Praat is called.
    # NOTE(review): 400 samples is 25 ms at 16 kHz, but whole files are loaded
    # at their native rate, so the duration this represents can differ.
    _MIN_SAMPLES = 400

    def __init__(
        self,
        pitch_floor: float = 60.0,
        pitch_ceiling: float = 330.0,
        time_averaging_window: float = 0.02,
        quefrency_averaging_window: float = 0.0005,
    ):
        """Store the Praat analysis settings.

        Parameters
        -----------
        pitch_floor: float
            Passed to both "To PowerCepstrogram" and "Get CPPS".
        pitch_ceiling: float
            Passed to "Get CPPS" together with ``pitch_floor``.
        time_averaging_window: float
            Passed to "Get CPPS".
        quefrency_averaging_window: float
            Passed to "Get CPPS".
        """
        self.pitch_floor = pitch_floor
        self.pitch_ceiling = pitch_ceiling
        self.time_averaging_window = time_averaging_window
        self.quefrency_averaging_window = quefrency_averaging_window

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Load (a segment of) an audio file and return Praat's CPPS for it.

        Parameters
        -----------
        utterance_id: str
            Not used by this method.
        audio_path: str
            Path of the audio file.
        start_time: float
            Segment start in seconds.
        end_time: float
            Segment end in seconds; the sentinel -1.0 means "to the end".

        Returns
        -----------
        Optional[float]
            The value returned by Praat, or None when loading fails, the
            sound is too short, or Praat raises an error.
        """
        # The defaults (0.0, -1.0) mean "whole file".
        use_segment = start_time != 0.0 or end_time != -1.0
        try:
            if use_segment:
                duration = end_time - start_time if end_time != -1.0 else None
                # Segments go through librosa: resampled to 16 kHz, mono.
                audio, fs = librosa.load(audio_path, sr=16000, mono=True,
                                         offset=start_time, duration=duration)
                sound = parselmouth.Sound(audio, sampling_frequency=fs)
            else:
                # Whole files are read by parselmouth directly, without the
                # 16 kHz / mono conversion applied to segments above.
                sound = parselmouth.Sound(audio_path)
        except Exception as e:
            # Best-effort scoring: report the failure and signal "no score".
            print(f"Error loading audio {audio_path}: {e}")
            return None

        if sound.n_samples < self._MIN_SAMPLES:
            print(f"Warning: Audio too short for {audio_path}.")
            return None

        return self._score_audio_from_sound(sound)

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """Score an in-memory signal; returns None when it is too short."""
        sound = parselmouth.Sound(audio, sampling_frequency=fs)
        if sound.n_samples < self._MIN_SAMPLES:
            return None
        return self._score_audio_from_sound(sound)

    def _score_audio_from_sound(self, sound) -> Optional[float]:
        """Run "To PowerCepstrogram" followed by "Get CPPS" on ``sound``.

        Returns the value produced by Praat, or None if either call raises.
        The argument labels below follow the Praat manual for these commands.
        """
        try:
            power_cepstrogram = call(
                sound,
                "To PowerCepstrogram",
                self.pitch_floor,   # pitch floor (Hz)
                0.002,              # time step (s)
                8000.0,             # maximum frequency (Hz)
                50.0                # pre-emphasis from (Hz)
            )
            cpps = call(
                power_cepstrogram,
                "Get CPPS",
                "yes",                            # subtract trend before smoothing
                self.time_averaging_window,       # time averaging window (s)
                self.quefrency_averaging_window,  # quefrency averaging window (s)
                self.pitch_floor,                 # peak search pitch range, low (Hz)
                self.pitch_ceiling,               # peak search pitch range, high (Hz)
                0.05,                             # tolerance
                "Parabolic",                      # interpolation
                0.001,                            # trend line quefrency range, low (s)
                0.05,                             # trend line quefrency range, high (s)
                "Straight",                       # trend type
                "Robust slow"                     # fit method
            )
            return cpps
        except Exception as e:
            print(f"Error computing Praat CPP: {e}")
            return None