# Source code for pathbench.speech_rate

from typing import Optional
import librosa
from pathbench.evaluator import ReferenceFreeEvaluator, ReferenceTxtEvaluator
import math
import parselmouth
from parselmouth.praat import call
import numpy as np



# [docs]
class WpmEvaluator(ReferenceTxtEvaluator):
    """An evaluator that scores based on the speech rate (words per minute)."""

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        transcription: str,
        language: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """
        Returns the speech rate in words per minute (WPM).

        The word count is the number of whitespace-separated tokens in
        ``transcription``; the duration is that of the audio actually loaded
        from ``audio_path`` for the requested window.

        Args:
            utterance_id: Identifier of the utterance (not used by the computation).
            audio_path: Path of the audio file to measure.
            transcription: Reference text whose words are counted.
            language: Language of the utterance (not used by the computation).
            start_time: Offset in seconds at which reading starts.
            end_time: End of the window in seconds; ``-1.0`` reads to the end
                of the file.

        Returns:
            Words per minute; ``0.0`` when the loaded audio is empty or the
            transcription contains no words; ``None`` when processing raised
            (the error is printed).
        """
        try:
            duration = end_time - start_time if end_time != -1.0 else None
            # Only the sample count is needed here, so keep the file's native
            # sampling rate (sr=None) instead of paying for a resample.
            audio, fs = librosa.load(audio_path, sr=None, offset=start_time, duration=duration)

            if audio is None or fs is None or len(audio) == 0:
                return 0.0

            duration_s = len(audio) / fs
            if duration_s <= 0:
                return 0.0

            # Count words in transcription
            word_count = len(transcription.split())
            if word_count == 0:
                return 0.0

            # words per second -> words per minute
            return (word_count / duration_s) * 60
        except Exception as e:
            print(f"Error processing file {audio_path}: {e}")
            return None



###########################################################################
#                                                                         #
#  Praat Script Syllable Nuclei                                           #
#  Copyright (C) 2008  Nivja de Jong and Ton Wempe                        #
#                                                                         #
#    This program is free software: you can redistribute it and/or modify #
#    it under the terms of the GNU General Public License as published by #
#    the Free Software Foundation, either version 3 of the License, or    #
#    (at your option) any later version.                                  #
#                                                                         #
#    This program is distributed in the hope that it will be useful,      #
#    but WITHOUT ANY WARRANTY; without even the implied warranty of       #
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        #
#    GNU General Public License for more details.                         #
#                                                                         #
#    You should have received a copy of the GNU General Public License    #
#    along with this program.  If not, see http://www.gnu.org/licenses/   #
#                                                                         #
###########################################################################
#
# modified 2010.09.17 by Hugo Quené, Ingrid Persoon, & Nivja de Jong
# Overview of changes:
# + change threshold-calculator: rather than using median, use the almost maximum
#     minus 25 dB. (25 dB is in line with the standard setting to detect silence
#     in the "To TextGrid (silences)" function.)
#     Almost maximum (.99 quantile) is used rather than maximum to avoid using
#     irrelevant non-speech sound-bursts.
# + add silence-information to calculate articulation rate and ASD (average syllable
#     duration).
#     NB: speech rate = number of syllables / total time
#         articulation rate = number of syllables / phonation time
# + remove max number of syllable nuclei
# + refer to objects by unique identifier, not by name
# + keep track of all created intermediate objects, select these explicitly,
#     then Remove
# + provide summary output in Info window
# + do not save TextGrid-file but leave it in Object-window for inspection
#     (if requested in startup-form)
# + allow Sound to have starting time different from zero
#      for Sound objects created with Extract (preserve times)
# + programming of checking loop for mindip adjusted
#      in the orig version, precedingtime was not modified if the peak was rejected !!
#      var precedingtime and precedingint renamed to currenttime and currentint
#
# + bug fixed concerning summing total pause, feb 28th 2011
###########################################################################


# counts syllables of all sound utterances in a directory
# NB unstressed syllables are sometimes overlooked
# NB filter sounds that are quite noisy beforehand
# NB use Silence threshold (dB) = -25 (or -20?)
# NB use Minimum dip between peaks (dB) = between 2-4 (you can first try;
#                                                      For clean and filtered: 4)
#
#
# Translated to Python in 2019 by David Feinberg
# I changed all the variable names so they are human readable


# [docs]
class PraatSpeechRateEvaluator(ReferenceFreeEvaluator):
    """
    An evaluator that scores based on the speech rate (syllables per second)
    using a Python translation of a Praat script by de Jong and Wempe.

    Syllable nuclei are estimated as intensity peaks that exceed a silence
    threshold, are followed by a sufficient intensity dip before the next
    candidate, have a defined pitch value, and lie in a "sounding" interval.
    """

    def score(
        self,
        utterance_id: str,
        audio_path: str,
        start_time: float = 0.0,
        end_time: float = -1.0,
    ) -> Optional[float]:
        """
        Returns the speech rate in syllables per second.

        Args:
            utterance_id: Identifier of the utterance (not used by the computation).
            audio_path: Path of the audio file to analyse.
            start_time: Offset in seconds at which reading starts.
            end_time: End of the window in seconds; ``-1.0`` reads to the end
                of the file.

        Returns:
            The value computed by ``_score_audio`` for the loaded samples, or
            ``None`` when the audio could not be loaded (the error is printed).
        """
        try:
            duration = end_time - start_time if end_time != -1.0 else None
            audio, fs = librosa.load(audio_path, sr=None, mono=True, offset=start_time, duration=duration)
        except Exception as e:
            print(f"Error loading audio {audio_path} with PraatSpeechRateEvaluator: {e}")
            return None
        return self._score_audio(audio, fs)

    def _score_audio(self, audio: np.ndarray, fs: int) -> Optional[float]:
        """
        Estimate syllables per second for already-loaded samples.

        Args:
            audio: Samples handed to ``parselmouth.Sound``.
            fs: Sampling frequency of ``audio``.

        Returns:
            Number of accepted syllable nuclei divided by the total duration
            of the sound; ``0.0`` when there are no samples, the duration is
            zero, or fewer than two candidate peaks are found; ``None`` when
            any step raised (the error is printed).
        """
        try:
            silencedb = -25  # offset (dB) added to the .99-quantile intensity
            mindip = 2       # minimum dip (dB) between neighbouring peaks
            minpause = 0.3   # second argument of "To TextGrid (silences)"

            # Zero samples means zero duration; answer directly instead of
            # relying on Praat to construct an empty Sound.
            if np.size(audio) == 0:
                return 0.0

            sound = parselmouth.Sound(audio, sampling_frequency=fs)

            originaldur = sound.get_total_duration()
            if originaldur == 0:
                return 0.0

            intensity = sound.to_intensity(50)
            min_intensity = call(intensity, "Get minimum", 0, 0, "Parabolic")
            max_intensity = call(intensity, "Get maximum", 0, 0, "Parabolic")

            # "almost maximum": the .99 quantile is used instead of the true
            # maximum to avoid irrelevant non-speech sound bursts
            max_99_intensity = call(intensity, "Get quantile", 0, 0, 0.99)

            # peak threshold, never lower than the minimum intensity
            threshold = max_99_intensity + silencedb
            if threshold < min_intensity:
                threshold = min_intensity

            # threshold handed to "To TextGrid (silences)": silencedb shifted
            # by the gap between the true maximum and the .99 quantile
            threshold3 = silencedb - (max_intensity - max_99_intensity)
            textgrid = call(intensity, "To TextGrid (silences)", threshold3, minpause, 0.1, "silent", "sounding")

            intensity_matrix = call(intensity, "Down to Matrix")
            sound_from_intensity_matrix = call(intensity_matrix, "To Sound (slice)", 1)

            # times of all extrema of the intensity contour
            point_process = call(sound_from_intensity_matrix, "To PointProcess (extrema)", "Left", "yes", "no", "Sinc70")
            numpeaks = call(point_process, "Get number of points")
            t = [call(point_process, "Get time from index", i + 1) for i in range(numpeaks)]

            # candidate peaks: extrema whose intensity exceeds the threshold
            timepeaks = []
            intensities = []
            for peaktime in t:
                value = call(sound_from_intensity_matrix, "Get value at time", peaktime, "Cubic")
                if value > threshold:
                    intensities.append(value)
                    timepeaks.append(peaktime)

            peakcount = len(timepeaks)
            if peakcount < 2:
                return 0.0

            # Keep candidate p when the intensity drops by more than mindip
            # between it and the next candidate.
            # NOTE: the loop stops one short of the end, so the last candidate
            # is never added; this is kept unchanged from the translation.
            currenttime = timepeaks[0]
            currentint = intensities[0]
            validtime = []
            for p in range(peakcount - 1):
                nexttime = timepeaks[p + 1]
                dip = call(intensity, "Get minimum", currenttime, nexttime, "None")
                if abs(currentint - dip) > mindip:
                    validtime.append(timepeaks[p])
                currenttime = nexttime
                currentint = call(intensity, "Get value at time", nexttime, "Cubic")

            # Count only peaks with a defined pitch value that lie in an
            # interval labelled "sounding".
            pitch = sound.to_pitch_ac(0.02, 30, 4, False, 0.03, 0.25, 0.01, 0.35, 0.25, 450)
            voicedcount = 0
            for querytime in validtime:
                whichinterval = call(textgrid, "Get interval at time", 1, querytime)
                whichlabel = call(textgrid, "Get label of interval", 1, whichinterval)
                value = pitch.get_value_at_time(querytime)
                if not math.isnan(value) and whichlabel == "sounding":
                    voicedcount += 1

            # speech rate = number of syllables / total time
            return voicedcount / originaldur
        except Exception as e:
            # Parselmouth can raise a generic "PraatError"
            print(f"Error processing audio with PraatSpeechRateEvaluator: {e}")
            return None