Source code for pathbench.speech_rate

from typing import Optional
import librosa
from pathbench.evaluator import ReferenceFreeEvaluator, ReferenceTxtEvaluator
import math
import parselmouth
from parselmouth.praat import call
import numpy as np


class WpmEvaluator(ReferenceTxtEvaluator):
    """An evaluator that scores based on the speech rate (words per minute)."""

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        transcription: str,
        language: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Return the speech rate in words per minute (WPM).

        Args:
            utterance_id: Identifier of the utterance; not read by this method.
            audio_path: Path of the audio file whose duration is measured.
            transcription: Text of the utterance; words are counted as
                whitespace-separated tokens.
            language: Language of the utterance; not read by this method.
            start_time: Offset in seconds at which the scored segment starts.
            end_time: End of the scored segment in seconds. The sentinel
                ``-1.0`` means "until the end of the file".

        Returns:
            The number of words per minute, ``0.0`` when the segment holds no
            audio or the transcription holds no words, or ``None`` when the
            audio could not be processed (the error is printed).
        """
        try:
            if end_time == -1.0:
                duration = None
            else:
                duration = end_time - start_time
                if duration <= 0:
                    # An empty or inverted window contains no audio; never
                    # hand a non-positive duration to the loader.
                    return 0.0

            # Only the segment length is needed, so keep the file's native
            # sampling rate instead of paying for a resample.
            audio, fs = librosa.load(
                audio_path,
                sr=None,
                mono=True,
                offset=start_time,
                duration=duration,
            )
            if audio is None or fs is None or len(audio) == 0:
                return 0.0
            duration_s = len(audio) / fs

            # Count words in transcription
            word_count = len(transcription.split())
            if word_count == 0:
                return 0.0

            # Words per second scaled to words per minute.
            return (word_count / duration_s) * 60
        except Exception as e:
            print(f"Error processing file {audio_path}: {e}")
            return None
###########################################################################
#
# Praat Script Syllable Nuclei
# Copyright (C) 2008 Nivja de Jong and Ton Wempe
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see http://www.gnu.org/licenses/
#
###########################################################################
#
# modified 2010.09.17 by Hugo Quené, Ingrid Persoon, & Nivja de Jong
# Overview of changes:
# + change threshold-calculator: rather than using median, use the almost maximum
#   minus 25dB. (25 dB is in line with the standard setting to detect silence
#   in the "To TextGrid (silences)" function.)
#   Almost maximum (.99 quantile) is used rather than maximum to avoid using
#   irrelevant non-speech sound-bursts.
# + add silence-information to calculate articulation rate and ASD (average syllable
#   duration).
#   NB: speech rate = number of syllables / total time
#       articulation rate = number of syllables / phonation time
# + remove max number of syllable nuclei
# + refer to objects by unique identifier, not by name
# + keep track of all created intermediate objects, select these explicitly,
#   then Remove
# + provide summary output in Info window
# + do not save TextGrid-file but leave it in Object-window for inspection
#   (if requested in startup-form)
# + allow Sound to have starting time different from zero
#   for Sound objects created with Extract (preserve times)
# + programming of checking loop for mindip adjusted
#   in the orig version, precedingtime was not modified if the peak was rejected !!
#   var precedingtime and precedingint renamed to currenttime and currentint
#
# + bug fixed concerning summing total pause, feb 28th 2011
###########################################################################
# counts syllables of all sound utterances in a directory
# NB unstressed syllables are sometimes overlooked
# NB filter sounds that are quite noisy beforehand
# NB use Silence threshold (dB) = -25 (or -20?)
# NB use Minimum dip between peaks (dB) = between 2-4 (you can first try;
#    For clean and filtered: 4)
#
#
# Translated to Python in 2019 by David Feinberg
# I changed all the variable names so they are human readable
class PraatSpeechRateEvaluator(ReferenceFreeEvaluator):
    """Reference-free evaluator scoring speech rate in syllables per second.

    Syllable nuclei are located with a Python translation of the Praat
    script by de Jong and Wempe.
    """

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """Load a segment of ``audio_path`` and return its speech rate.

        Args:
            utterance_id: Identifier of the utterance; not read by this method.
            audio_path: Path of the audio file to analyse.
            start_time: Offset in seconds at which the segment starts.
            end_time: End of the segment in seconds; ``-1.0`` means no
                duration limit is passed to the loader.

        Returns:
            Syllables per second as computed by ``_score_audio``, or ``None``
            when loading the audio fails (the error is printed).
        """
        try:
            if end_time != -1.0:
                segment_len = end_time - start_time
            else:
                segment_len = None
            samples, rate = librosa.load(
                audio_path,
                sr=None,
                mono=True,
                offset=start_time,
                duration=segment_len,
            )
        except Exception as e:
            print(f"Error loading audio {audio_path} with PraatSpeechRateEvaluator: {e}")
            return None
        else:
            return self._score_audio(samples, rate)

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """Count voiced syllable nuclei in ``audio`` and divide by its duration.

        Args:
            audio: Samples handed to ``parselmouth.Sound``.
            fs: Sampling frequency of ``audio``.

        Returns:
            Voiced syllable nuclei per second of total duration, ``0.0`` when
            the sound has zero duration or fewer than two intensity peaks
            exceed the threshold, or ``None`` when any step raises (the error
            is printed).
        """
        silence_db = -25   # offset from the .99 intensity quantile
        min_dip_db = 2     # required dip between neighbouring peaks
        min_pause_s = 0.3  # passed to "To TextGrid (silences)"

        try:
            snd = parselmouth.Sound(audio, sampling_frequency=fs)
            total_dur = snd.get_total_duration()
            if total_dur == 0:
                return 0.0

            intensity = snd.to_intensity(50)
            floor_db = call(intensity, "Get minimum", 0, 0, "Parabolic")
            ceil_db = call(intensity, "Get maximum", 0, 0, "Parabolic")
            q99_db = call(intensity, "Get quantile", 0, 0, 0.99)

            # Peak threshold: "almost maximum" plus the silence offset, but
            # never below the minimum intensity.
            peak_threshold = max(q99_db + silence_db, floor_db)
            # Silence threshold handed to the TextGrid segmentation.
            silence_threshold = silence_db - (ceil_db - q99_db)

            textgrid = call(
                intensity,
                "To TextGrid (silences)",
                silence_threshold,
                min_pause_s,
                0.1,
                "silent",
                "sounding",
            )

            # Turn the intensity contour into a Sound so its extrema can be
            # extracted as a PointProcess.
            intensity_matrix = call(intensity, "Down to Matrix")
            contour = call(intensity_matrix, "To Sound (slice)", 1)
            extrema = call(
                contour, "To PointProcess (extrema)", "Left", "yes", "no", "Sinc70"
            )

            # Keep only the extrema whose level exceeds the peak threshold.
            peak_times = []
            peak_levels = []
            n_extrema = call(extrema, "Get number of points")
            for idx in range(1, n_extrema + 1):
                when = call(extrema, "Get time from index", idx)
                level = call(contour, "Get value at time", when, "Cubic")
                if level > peak_threshold:
                    peak_times.append(when)
                    peak_levels.append(level)

            if len(peak_times) < 2:
                return 0.0

            # A peak is kept when the intensity dips by more than min_dip_db
            # before the next peak; the last peak is never a candidate.
            nuclei = []
            level_here = peak_levels[0]
            for here, upcoming in zip(peak_times, peak_times[1:]):
                trough = call(intensity, "Get minimum", here, upcoming, "None")
                if abs(level_here - trough) > min_dip_db:
                    nuclei.append(here)
                level_here = call(intensity, "Get value at time", upcoming, "Cubic")

            pitch = snd.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)

            # Count the kept peaks that have a defined pitch and lie in an
            # interval labelled "sounding".
            voiced = 0
            for when in nuclei:
                interval = call(textgrid, "Get interval at time", 1, when)
                label = call(textgrid, "Get label of interval", 1, interval)
                f0 = pitch.get_value_at_time(when)
                if not math.isnan(f0) and label == "sounding":
                    voiced += 1

            return voiced / total_dur
        except Exception as e:
            # Parselmouth can raise a generic "PraatError"
            print(f"Error processing audio with PraatSpeechRateEvaluator: {e}")
            return None