% Conference paper in the proceedings volume of the 37th "Elektronische
% Sprachsignalverarbeitung" conference (2026). The publisher's city is stored
% in `address`, separate from `publisher`, so that styles can format both.
@InProceedings{Zhang2026_1284,
  author    = {Zhang, Tianyi and Birkholz, Peter},
  title     = {Joint Estimation of Source and Filter Parameters for Speaker Adaptation in Articulatory Speech Synthesis},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2026, Tagungsband der 37. Konferenz},
  editor    = {Wirsching, Günther},
  year      = {2026},
  month     = mar,
  pages     = {102--111},
  publisher = {TUDpress},
  address   = {Dresden},
  isbn      = {978-3-95908-834-3},
  issn      = {0940-6832},
  keywords  = {Speech Synthesis},
  url       = {https://www.essv.de/pdf/2026_102_111.pdf},
  abstract  = {Magnetic Resonance Imaging (MRI) data of the vocal tract were used to reproduce the anatomy and articulation of two new speakers for the articulatory speech synthesizer VocalTractLab. Due to the limited MRI resolution and the artificially sustained phonemes during the scans, the vocal tract shapes reconstructed from the MRI data usually did not fully correspond to the natural articulatory configurations. Therefore, we introduced a strategy for the joint optimization of the vocal tract model and vocal fold model parameters to match the synthesized vowels with their naturally produced counterparts. We used genetic algorithms (GA) and particle swarm optimization (PSO) to minimize the root mean squared error (RMSE) between the mel-frequency-scaled true spectral envelopes of the natural and synthetic vowel sounds. PSO achieved the best results for 65% of the vowels, reducing the RMSE from 15dB to 3dB, while the best results for the remaining vowels were obtained by the GA. Furthermore, informal listening tests confirmed a substantial improvement in the quality of the synthetic vowels.},
}