Source code for pathbench.p_estoi_evaluator
from typing import Optional, List
import numpy as np
import librosa
from pathbench.reference_evaluator import ReferenceEvaluator, STOI
from pathbench.string_clean import clean_text
from pathbench.vad import FATrimmer
[docs]
class ForcedAlignmentPESTOIEvaluator(ReferenceEvaluator):
    """An evaluator that uses P-ESTOI to compute a score after trimming silence using forced alignment.

    Audio given with an explicit segment (``start != 0.0`` or ``end != -1.0``)
    is cut directly with ``librosa.load``; audio given without a segment is
    handed to a ``FATrimmer`` together with the transcription and language.
    """

    def __init__(self, model_id: str = "facebook/wav2vec2-xlsr-53-espeak-cv-ft", **kwargs):
        """Create the evaluator and its forced-alignment trimmer.

        Args:
            model_id: Identifier passed to ``FATrimmer``.
            **kwargs: Forwarded unchanged to ``ReferenceEvaluator.__init__``.
        """
        super().__init__(**kwargs)
        self.trimmer = FATrimmer(model_id)

    def _load_or_trim_audio(
        self,
        path: str,
        transcription: str,
        language: str,
        start: float,
        end: float,
    ) -> Optional[np.ndarray]:
        """Load one audio clip, either as an explicit segment or via the trimmer.

        Shared by the test audio and every reference audio so both follow
        exactly the same loading rules.

        Args:
            path: Audio file to read.
            transcription: Text passed to the trimmer when no segment is given.
            language: Language passed to the trimmer when no segment is given.
            start: Segment start; ``0.0`` together with ``end == -1.0`` means
                "no explicit segment".
            end: Segment end; ``-1`` means "until the end of the file".

        Returns:
            The samples, or ``None`` if the file could not be read or the
            trimmer returned a falsy result.
        """
        has_segment = start != 0.0 or end != -1.0
        if has_segment:
            # An open-ended segment (end == -1) is read to the end of the file.
            duration = end - start if end != -1 else None
            try:
                samples, _ = librosa.load(
                    path, sr=16000, offset=start, duration=duration, dtype=np.float64
                )
            except Exception as e:
                # Best effort: report the failure and let the caller decide
                # how to treat a missing clip.
                print(f"Error reading audio file {path}: {e}")
                return None
            return samples

        # No explicit segment: let the forced-alignment trimmer handle it.
        # Errors raised by the trimmer are intentionally not caught here.
        trimmed = self.trimmer.trim(path, transcription, language, start, end)
        if trimmed:
            samples, _ = trimmed
            return samples
        return None

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        transcription: str,
        language: str,
        reference_audios: List[tuple[str, float, float]],
        start_time: float,
        end_time: float,
        **kwargs,
    ) -> Optional[float]:
        """
        Computes the P-ESTOI score after trimming silence.

        Args:
            utterance_id: Identifier used only in the warning message.
            audio_path: Path of the test audio file.
            transcription: Text passed to the trimmer for the test audio and
                for every reference audio.
            language: Language passed to the trimmer.
            reference_audios: ``(path, start, end)`` triples of reference
                recordings; may be empty or ``None``.
            start_time: Segment start of the test audio (``0.0`` = none).
            end_time: Segment end of the test audio (``-1.0`` = none).
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            ``0.0`` when the test audio cannot be loaded/trimmed or consists
            only of zeros, ``None`` when no reference audio could be loaded,
            otherwise the first entry of ``STOI(...).estoi_val``.
        """
        test_audio = self._load_or_trim_audio(
            audio_path, transcription, language, start_time, end_time
        )

        # An unreadable or all-zero test signal is scored as 0.0 rather than
        # being treated as an error.
        if test_audio is None or np.all(test_audio == 0):
            print(f"Warning: Test audio {audio_path} is silent or could not be trimmed. Returning P-ESTOI score of 0.0.")
            return 0.0

        # Collect every reference that loads successfully; failures are skipped.
        reference_audios_data = []
        for ref_path, ref_start, ref_end in reference_audios or ():
            ref_audio = self._load_or_trim_audio(
                ref_path, transcription, language, ref_start, ref_end
            )
            if ref_audio is not None:
                reference_audios_data.append(ref_audio)

        if not reference_audios_data:
            print(f"Warning: No valid reference audios found for {utterance_id}. Cannot compute P-ESTOI.")
            return None

        # NOTE(review): self.stoi_kwargs is not set in this class; presumably
        # it is provided by ReferenceEvaluator -- confirm.
        stoi_object = STOI(
            normalization_method='RMS',
            centroid_ind=0,
            frame_deletion=True,
            reference_words=reference_audios_data,
            test_words=[test_audio],
            **self.stoi_kwargs
        )
        return stoi_object.estoi_val[0]