# Source code for pathbench.vsa_evaluator

from typing import List, Optional, Tuple
import os
import matplotlib.pyplot as plt

import numpy as np
import parselmouth
from scipy.spatial import ConvexHull
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from pathbench.evaluator import LanguageAwareSpeakerEvaluator



# [docs]
class VSAEvaluator(LanguageAwareSpeakerEvaluator):
    """
    An evaluator that computes the Vowel Space Area (VSA) for a speaker.

    Pipeline (see ``_score_audio_list``):
        A. Extract (F1, F2) pairs from every voiced frame of every recording.
        B. Drop low-likelihood points using a Gaussian mixture model.
        C. Cluster the remaining points with k-means, initialised from
           published per-language vowel formant tables.
        D. Report the area of the convex hull of the cluster centres.

    The algorithm is based on:
    - TODO: the reference is missing here; add the citation.

    The vowel formant data for initialisation is from:
    - English: Hillenbrand, J., Getty, L. A., Clark, M. J., & Wheeler, K. (1995).
      Acoustic characteristics of American English vowels. JASA, 97(5), 3099-3111.
    - Dutch: Adank et al. We use southern Dutch formant values due to the
      Belgian context.
    - Italian: Bertinetto, P. M. (?) "The sound pattern of Standard Italian, as
      compared with the varieties spoken in Florence, Milan and Rome."
    - Spanish: Bradlow: A comparative acoustic study of English and Spanish vowels
    - Mandarin: Yang: Vowel production by Mandarin speakers of English

    When contributing new values please include the reference for the paper and
    cross-check. A future approach could perhaps use control speakers for
    formant initialisation.
    """

    # Mean pitch (Hz) above which a speaker is classed as female when the
    # gender has to be estimated.
    _GENDER_PITCH_THRESHOLD_HZ = 165
    # Formant ceiling (Hz) handed to the Burg formant tracker, per gender.
    _MAX_FORMANT_HZ = {'f': 5500, 'm': 5000}
    # Number of mixture components of the outlier-filtering GMM.
    _GMM_COMPONENTS = 4
    # Accepted spellings of the ``gender`` argument (compared case-insensitively).
    _GENDER_ALIASES = {'f': 'f', 'female': 'f', 'm': 'm', 'male': 'm'}

    def __init__(self, gender: Optional[str] = None, visualize: bool = False):
        """
        Initializes the VSA evaluator.

        Args:
            gender: The gender of the speaker ('m' or 'f'; 'male'/'female' and
                    any letter case are also accepted). If not provided, or not
                    recognised, it is estimated from the pitch at score time.
            visualize: Whether to save vowel space plots to the 'figure/' directory.
        """
        # NOTE(review): the base-class __init__ is not called here; confirm that
        # LanguageAwareSpeakerEvaluator does not require it.
        # Stored as given; it is normalised each time a speaker is scored.
        self.gender = gender
        self.visualize = visualize

        # Every table below holds one [F1, F2] row (Hz) per vowel and is used
        # as the k-means initialisation, so its row count must equal the
        # language's 'n_clusters' entry.

        # English (Hillenbrand 1995) - 12 vowels (Bence: I haven't checked this data myself)
        hillenbrand_female = np.array([
            [437, 2779], [533, 2479], [543, 2320], [691, 2056],
            [855, 1845], [912, 1448], [707, 1219], [537, 1069],
            [524, 1331], [463, 1180], [759, 1403], [548, 1650]
        ])
        hillenbrand_male = np.array([
            [370, 2299], [468, 2049], [479, 2019], [610, 1794],
            [738, 1661], [793, 1299], [624, 1089], [471, 939],
            [462, 1179], [395, 1029], [663, 1239], [490, 1469]
        ])

        # Italian - 7 vowels
        italian_male = np.array([
            [290, 2310], [350, 2050], [490, 1950], [780, 1430], [550, 970],
            [390, 870], [320, 800]])
        # No separate female table: the male values are reused as-is.
        italian_female = italian_male

        # Spanish (Bradlow) - 5 vowels
        # TODO: Actually no female results are reported in Bradlow and male
        # matches with female. This needs correction but I suspect initialisation
        # has low impact
        spanish_female = np.array([
            [286, 2147], [458, 1814], [638, 1353], [460, 1019], [322, 992]
        ])
        spanish_male = np.array([
            [290, 2250], [450, 1850], [700, 1300], [450, 950], [300, 850]
        ])

        # Dutch (Adank et al.) - 15 vowels
        dutch_female = np.array([
            [725, 1262], [868, 1640], [581, 1932], [436, 2420], [439, 1804],
            [455, 2115], [317, 2647], [475, 987], [321, 1019], [418, 968],
            [457, 1785], [337, 2077], [696, 1282], [670, 2159], [696, 1762]
        ])
        dutch_male = np.array([
            [555, 1066], [717, 1429], [475, 1616], [384, 1993], [374, 1539],
            [364, 1745], [278, 2179], [398, 850], [266, 978], [369, 862],
            [353, 1492], [265, 1825], [549, 1127], [545, 1779], [583, 1484]
        ])

        # Mandarin Chinese - 11 vowels
        mandarin_male = np.array([
            [328, 2206], [422, 2064], [368, 1028], [444, 1111], [742, 1284],
            [412, 2046], [606, 1823], [693, 1800], [728, 1368], [377, 1054],
            [633, 1094]
        ])
        mandarin_female = np.array([
            [340, 2641], [458, 2465], [403, 1172], [496, 1157], [906, 1429],
            [434, 2444], [762, 2078], [846, 1956], [807, 1534], [424, 1188],
            [752, 1243]
        ])

        # Keyed by base language code; 'zh' and 'cmn' both map to Mandarin.
        self._vowel_tables = {
            'en': {'n_clusters': 12, 'female': hillenbrand_female, 'male': hillenbrand_male},
            'it': {'n_clusters': 7,  'female': italian_female,     'male': italian_male},
            'es': {'n_clusters': 5,  'female': spanish_female,     'male': spanish_male},
            'nl': {'n_clusters': 15, 'female': dutch_female,       'male': dutch_male},
            'zh':  {'n_clusters': 11, 'female': mandarin_female,   'male': mandarin_male},
            'cmn': {'n_clusters': 11, 'female': mandarin_female,   'male': mandarin_male},
        }

    def _score_audio_list(
        self,
        audios: List[Tuple[np.ndarray, int]],
        language: str,
        speaker_id: str = "unknown_speaker",
    ) -> Optional[float]:
        """
        Computes the vowel space area of one speaker from their recordings.

        Args:
            audios: (samples, sample_rate) pairs, one per recording.
            language: Language or locale code such as 'en', 'en-us' or 'en_US';
                      only the part before the first '-' or '_' is used.
            speaker_id: Used in the plot title and file name when visualising.

        Returns:
            None if there is no vowel table for the language; 0.0 if the area
            could not be computed (no usable formants, too few points, or a
            failing GMM / k-means / convex hull step); otherwise the area of
            the convex hull of the vowel cluster centres in the F1/F2 plane.
        """
        # Dataset.language uses phonemiser locale codes (e.g. 'en-us'); strip to
        # the base code. Also tolerate 'en_US'-style separators and upper case.
        language = language.replace('_', '-').split('-')[0].lower()
        if language not in self._vowel_tables:
            print(f"VSAEvaluator: no vowel table for language '{language}', skipping.")
            return None
        tables = self._vowel_tables[language]
        n_clusters = tables['n_clusters']

        # The recordings are traversed twice (pitch, then formants), so make
        # sure a one-shot iterable is not exhausted after the first pass.
        audios = list(audios)

        # Pitch is needed both for gender estimation and for the voicing
        # decision during formant extraction, so it is computed only once.
        pitch_tracks = self._compute_pitch_tracks(audios)

        gender = self._normalise_gender(self.gender)
        if gender is None:
            gender = self._estimate_gender(pitch_tracks)

        max_formant = self._MAX_FORMANT_HZ[gender]
        init_clusters = tables['female'] if gender == 'f' else tables['male']

        # A. Formant extraction
        all_formants = self._extract_voiced_formants(audios, pitch_tracks, max_formant)
        if not all_formants:
            print("Warning: No formants extracted. Cannot calculate VSA.")
            return 0.0

        Fp = np.array(all_formants)

        if Fp.shape[0] < self._GMM_COMPONENTS:
            print("Warning: Not enough formant points to build GMM. Cannot calculate VSA.")
            return 0.0

        # B. Filtering: discard points whose log-likelihood under the GMM is
        # more than two standard deviations below the mean.
        try:
            gmm = GaussianMixture(
                n_components=self._GMM_COMPONENTS, random_state=0
            ).fit(Fp)
            log_likelihood = gmm.score_samples(Fp)
            threshold = np.mean(log_likelihood) - 2 * np.std(log_likelihood)
            Fp_filtered = Fp[log_likelihood >= threshold]
        except Exception as e:
            print(f"Error during GMM filtering: {e}")
            return 0.0

        if len(Fp_filtered) < n_clusters:
            print(
                f"Warning: Not enough data points ({len(Fp_filtered)}) after filtering "
                f"for clustering. Cannot calculate VSA."
            )
            return 0.0

        # C. Clustering: one run (n_init=1) seeded with the published vowel
        # formants for this language and gender.
        try:
            kmeans = KMeans(
                n_clusters=n_clusters, init=init_clusters, n_init=1, random_state=0
            ).fit(Fp_filtered)
            Kp = kmeans.cluster_centers_
        except Exception as e:
            print(f"Error during KMeans clustering: {e}")
            return 0.0

        # D. Convex hull/area calculation
        try:
            hull = ConvexHull(Kp)
            # Known naming trap in scipy: in 2D, .volume is the enclosed area and
            # .area is the perimeter. hull.volume is intentionally correct here.
            vsa = hull.volume
        except Exception as e:
            print(f"Error calculating convex hull: {e}")
            return 0.0

        if self.visualize:
            self._save_vowel_space_plot(Fp_filtered, Kp, hull, vsa, speaker_id)

        return vsa

    @classmethod
    def _normalise_gender(cls, gender: Optional[str]) -> Optional[str]:
        """Maps a user-supplied gender to 'f' / 'm', or None if absent or unrecognised."""
        if gender is None:
            return None
        resolved = cls._GENDER_ALIASES.get(str(gender).strip().lower())
        if resolved is None:
            print(
                f"VSAEvaluator: unrecognised gender {gender!r}, "
                f"estimating it from pitch instead."
            )
        return resolved

    @staticmethod
    def _compute_pitch_tracks(
        audios: List[Tuple[np.ndarray, int]],
    ) -> List[Optional["parselmouth.Pitch"]]:
        """Runs pitch analysis once per recording; None marks a recording that failed."""
        tracks = []
        for speech, sample_rate in audios:
            try:
                sound = parselmouth.Sound(speech, sampling_frequency=sample_rate)
                tracks.append(sound.to_pitch())
            except Exception as e:
                print(f"Error processing audio: {e}")
                tracks.append(None)
        return tracks

    @classmethod
    def _estimate_gender(cls, pitch_tracks: List[Optional["parselmouth.Pitch"]]) -> str:
        """Guesses 'f' / 'm' from the average of the per-recording mean voiced pitch."""
        mean_pitches = []
        for pitch in pitch_tracks:
            if pitch is None:
                continue
            frequencies = pitch.selected_array['frequency']
            voiced = frequencies[frequencies > 0]
            if len(voiced) > 0:
                mean_pitches.append(np.mean(voiced))

        if not mean_pitches:
            # Default to female if pitch cannot be determined.
            print("Could not determine pitch, defaulting to female.")
            return 'f'

        avg_pitch = np.mean(mean_pitches)
        gender = 'f' if avg_pitch > cls._GENDER_PITCH_THRESHOLD_HZ else 'm'
        print(f"Estimated gender as '{gender}' with average pitch {avg_pitch:.2f} Hz.")
        return gender

    @staticmethod
    def _extract_voiced_formants(
        audios: List[Tuple[np.ndarray, int]],
        pitch_tracks: List[Optional["parselmouth.Pitch"]],
        max_formant: float,
    ) -> List[List[float]]:
        """Collects [F1, F2] at every voiced pitch frame of every usable recording."""
        points = []
        for (speech, sample_rate), pitch in zip(audios, pitch_tracks):
            if pitch is None:
                # Pitch analysis already failed (and was reported) for this one.
                continue
            try:
                sound = parselmouth.Sound(speech, sampling_frequency=sample_rate)
                formants = sound.to_formant_burg(
                    time_step=0.01, max_number_of_formants=5,
                    maximum_formant=max_formant, window_length=0.05,
                    pre_emphasis_from=50
                )
                # The frame-based pitch accessors used below are 1-based.
                for frame in range(1, pitch.get_number_of_frames() + 1):
                    # Written as "not (> 0)" so that a NaN pitch value is
                    # treated as unvoiced, exactly like a non-positive one.
                    if not pitch.get_value_in_frame(frame) > 0:
                        continue
                    frame_time = pitch.get_time_from_frame_number(frame)
                    f1 = formants.get_value_at_time(1, frame_time)
                    f2 = formants.get_value_at_time(2, frame_time)
                    if not (np.isnan(f1) or np.isnan(f2)):
                        points.append([f1, f2])
            except Exception as e:
                print(f"Error processing audio: {e}")
        return points

    @staticmethod
    def _save_vowel_space_plot(Fp_filtered, Kp, hull, vsa, speaker_id) -> None:
        """Saves the vowel space plot to 'figure/'; a failure here never loses the score."""
        fig = None
        try:
            fig_dir = "figure"
            os.makedirs(fig_dir, exist_ok=True)

            fig, ax = plt.subplots(figsize=(8, 8))

            # F2 on the x axis, F1 on the y axis.
            ax.scatter(Fp_filtered[:, 1], Fp_filtered[:, 0], alpha=0.2, label="Formants")
            ax.scatter(Kp[:, 1], Kp[:, 0], marker='x', s=100, c='r', label="Cluster Centers")

            for simplex in hull.simplices:
                ax.plot(Kp[simplex, 1], Kp[simplex, 0], 'g-')

            ax.set_xlabel("F2 (Hz)")
            ax.set_ylabel("F1 (Hz)")
            ax.set_title(f"Vowel Space Area for Speaker {speaker_id} (VSA: {vsa:.2f})")
            ax.legend()
            # Limits are given high-to-low, which reverses both axes; no
            # separate invert_xaxis()/invert_yaxis() call is needed.
            ax.set_xlim(3000, 700)
            ax.set_ylim(1200, 200)
            ax.grid(True)

            fig.savefig(os.path.join(fig_dir, f"{speaker_id}_vsa.png"))
        except Exception as e:
            print(f"Error saving VSA plot: {e}")
        finally:
            # Always release the figure, even if drawing or saving failed.
            if fig is not None:
                plt.close(fig)