Source code for pathbench.f0_range_evaluator
import parselmouth
from pathbench.evaluator import ReferenceFreeEvaluator, ReferenceFreeSpeakerEvaluator
from typing import List, Optional, Tuple
import numpy as np
import librosa
[docs]
class StdPitchEvaluator(ReferenceFreeEvaluator):
    """Reference-free evaluator that scores pitch variability.

    The score is the standard deviation of the voiced part of the pitch
    track after converting each pitch value to a logarithmic (semitone)
    scale.
    """

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Load a segment of an audio file and score it.

        Args:
            utterance_id: Identifier of the utterance; not used by this
                implementation.
            audio_path: Path of the audio file, passed to ``librosa.load``.
            start_time: Offset (seconds) at which loading starts.
            end_time: End of the segment in seconds; ``-1.0`` means no
                duration limit is passed to the loader.

        Returns:
            The value produced by ``_score_audio``, or ``None`` when loading
            or scoring raised an exception (the error is printed).
        """
        try:
            # -1.0 is the sentinel for "no explicit end": let librosa read on.
            if end_time == -1.0:
                segment_length = None
            else:
                segment_length = end_time - start_time
            samples, sample_rate = librosa.load(
                audio_path,
                sr=16000,
                offset=start_time,
                duration=segment_length,
            )
            return self._score_audio(samples, sample_rate)
        except Exception as e:
            print(f"Error processing audio file {audio_path}: {e}")
            return None

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """Compute the semitone-scale pitch standard deviation of a signal.

        Args:
            audio: Audio samples; ``None`` or an empty array yields ``0.0``.
            fs: Sampling frequency handed to ``parselmouth.Sound``.

        Returns:
            ``0.0`` when there is no audio or fewer than two voiced frames,
            the standard deviation of the converted pitch values otherwise,
            or ``None`` if an exception was raised (the error is printed).
        """
        try:
            if audio is None or len(audio) == 0:
                return 0.0

            track = parselmouth.Sound(audio, sampling_frequency=fs).to_pitch()
            frequencies = track.selected_array['frequency']

            # Frames with a non-positive frequency are treated as unvoiced.
            voiced = frequencies[frequencies > 0]
            if len(voiced) < 2:
                print("Warning: Not enough voiced frames to calculate std of pitch. Returning 0.")
                return 0.0

            # 39.86 ~= 12 / log10(2): maps log10(frequency) onto semitones.
            return np.std(39.86 * np.log10(voiced))
        except Exception as e:
            print(f"Error computing StdPitch: {e}")
            return None
[docs]
class F0RangeEvaluator(ReferenceFreeSpeakerEvaluator):
    """Speaker-level evaluator that computes the F0 range.

    The range is the difference between the largest and smallest voiced
    pitch value found across all of a speaker's audio segments.
    """

    def score(
        self,
        audio_files: List[Tuple[str, float, float]],
    ) -> Optional[float]:
        """Load every segment of a speaker and compute the pooled F0 range.

        Args:
            audio_files: ``(audio_path, start_time, end_time)`` tuples. An
                ``end_time`` of ``-1`` means no duration limit is passed to
                the loader.

        Returns:
            The value produced by ``_score_audio_list``, or ``None`` when no
            segment could be loaded. Files that fail to load are reported
            with ``print`` and skipped.
        """
        audios = []
        for audio_path, start_time, end_time in audio_files:
            try:
                duration = end_time - start_time if end_time != -1 else None
                y, sr = librosa.load(
                    audio_path, sr=16000, offset=start_time, duration=duration
                )
                if y is not None and len(y) > 0:
                    audios.append((y, sr))
            except Exception as e:
                # Best effort: one unreadable file must not discard the rest.
                print(f"Error loading audio file {audio_path}: {e}")

        if not audios:
            return None
        return self._score_audio_list(audios)

    def _score_audio_list(
        self, audios: List[Tuple[np.ndarray, int]]
    ) -> Optional[float]:
        """Compute the F0 range over a list of ``(samples, sample_rate)`` pairs.

        Args:
            audios: Signals with their sampling frequencies. ``None`` or
                empty signals are skipped, as are signals whose pitch
                extraction raises (the error is printed).

        Returns:
            ``max - min`` of all voiced pitch values as a ``float``
            (presumably in Hz, the unit of the pitch track's ``'frequency'``
            field), or ``0.0`` when no voiced frame was found.
        """
        voiced_chunks = []
        for y, sr in audios:
            try:
                if y is None or len(y) == 0:
                    continue
                sound = parselmouth.Sound(y, sampling_frequency=sr)
                f0 = sound.to_pitch().selected_array['frequency']
                # Frames with a non-positive frequency are treated as unvoiced.
                voiced = f0[f0 > 0]
                if len(voiced) > 0:
                    voiced_chunks.append(voiced)
            except Exception as e:
                print(f"Error processing audio: {e}")

        if not voiced_chunks:
            print("No valid F0 values found. Returning 0.")
            # Float zero keeps the return type consistent with the annotation.
            return 0.0

        # Pool once as an array instead of growing a Python list of scalars.
        f0_values = np.concatenate(voiced_chunks)
        return float(np.max(f0_values) - np.min(f0_values))