Source code for pathbench.f0_range_evaluator

import parselmouth
from pathbench.evaluator import ReferenceFreeEvaluator, ReferenceFreeSpeakerEvaluator
from typing import List, Optional, Tuple
import numpy as np
import librosa



[docs]
class StdPitchEvaluator(ReferenceFreeEvaluator):
    """Score an utterance by the standard deviation of its pitch in semitones.

    Only voiced frames (frames for which the pitch tracker reports a
    frequency above zero) contribute to the statistic.
    """

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Load a segment of ``audio_path`` and return its pitch deviation.

        Args:
            utterance_id: Identifier of the utterance; not used by this
                evaluator.
            audio_path: Path of the audio file to load.
            start_time: Offset in seconds at which loading starts.
            end_time: End of the segment in seconds; ``-1.0`` means "load
                until the end of the file".

        Returns:
            The standard deviation of the voiced pitch in semitones, ``0.0``
            when the segment is empty or has fewer than two voiced frames,
            or ``None`` when loading or pitch analysis fails.
        """
        # Only loading is guarded here: _score_audio reports and absorbs its
        # own failures, so wrapping it as well would mislabel the error.
        try:
            duration = end_time - start_time if end_time != -1.0 else None
            y, sr = librosa.load(
                audio_path, sr=16000, offset=start_time, duration=duration
            )
        except Exception as e:
            print(f"Error processing audio file {audio_path}: {e}")
            return None
        return self._score_audio(y, sr)

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """Return the standard deviation of voiced pitch, in semitones.

        Args:
            audio: Audio samples.
            fs: Sampling frequency of ``audio``.

        Returns:
            The standard deviation as a builtin ``float``, ``0.0`` when
            ``audio`` is empty or yields fewer than two voiced frames, or
            ``None`` when pitch analysis raises.
        """
        try:
            if audio is None or len(audio) == 0:
                return 0.0

            sound = parselmouth.Sound(audio, sampling_frequency=fs)
            f0 = sound.to_pitch().selected_array['frequency']

            # Filter out unvoiced frames
            voiced = f0[f0 > 0]

            if len(voiced) < 2:
                print("Warning: Not enough voiced frames to calculate std of pitch. Returning 0.")
                return 0.0

            # 12 semitones per octave. The reference frequency is irrelevant
            # here because a constant offset does not affect the standard
            # deviation. This is the exact form of the rounded
            # 39.86 * log10(f) approximation.
            semitones = 12.0 * np.log2(voiced)
            return float(np.std(semitones))
        except Exception as e:
            print(f"Error computing StdPitch: {e}")
            return None




[docs]
class F0RangeEvaluator(ReferenceFreeSpeakerEvaluator):
    """Score a speaker by the range of F0 across all of their recordings.

    The range is the difference between the highest and the lowest voiced
    pitch value found in any of the supplied audio segments.
    """

    def score(
        self,
        audio_files: List[Tuple[str, float, float]],
    ) -> Optional[float]:
        """Load every segment and return the pooled F0 range.

        Args:
            audio_files: ``(audio_path, start_time, end_time)`` triples.
                Times are in seconds; an ``end_time`` of ``-1`` means "load
                until the end of the file".

        Returns:
            The F0 range, ``0.0`` when no voiced frame is found, or ``None``
            when no segment could be loaded. Segments that fail to load are
            reported and skipped.
        """
        audios = []
        for audio_path, start_time, end_time in audio_files:
            try:
                duration = end_time - start_time if end_time != -1 else None
                y, sr = librosa.load(
                    audio_path, sr=16000, offset=start_time, duration=duration
                )
                if y is not None and len(y) > 0:
                    audios.append((y, sr))
            except Exception as e:
                # Best effort: one unreadable file must not discard the rest.
                print(f"Error loading audio file {audio_path}: {e}")

        if not audios:
            return None
        return self._score_audio_list(audios)

    def _score_audio_list(
        self, audios: List[Tuple[np.ndarray, int]]
    ) -> Optional[float]:
        """Return max minus min voiced F0 over all ``(samples, rate)`` pairs.

        Args:
            audios: Pairs of audio samples and their sampling frequency.

        Returns:
            The F0 range as a builtin ``float`` (in the frequency units
            reported by the pitch tracker), or ``0.0`` when no voiced frame
            is found. Signals whose analysis raises are reported and skipped.
        """
        voiced_chunks = []
        for y, sr in audios:
            try:
                if y is None or len(y) == 0:
                    continue
                sound = parselmouth.Sound(y, sampling_frequency=sr)
                f0 = sound.to_pitch().selected_array['frequency']
                # Keep voiced frames only (frequency above zero).
                voiced = f0[f0 > 0]
                if voiced.size:
                    voiced_chunks.append(voiced)
            except Exception as e:
                print(f"Error processing audio: {e}")

        if not voiced_chunks:
            print("No valid F0 values found. Returning 0.")
            return 0.0

        # Pool the per-file arrays once instead of growing a Python list
        # element by element.
        all_f0 = np.concatenate(voiced_chunks)
        return float(all_f0.max() - all_f0.min())