% Gutscher2022_1163: conference paper in the 2022 "Elektronische
% Sprachsignalverarbeitung" proceedings volume (PDF hosted on essv.de).
% Names are stored in "Last, First" form so that the compound family name
% "Svensson Lundmark" is kept together by the name parser; the publisher's
% city is stored in the address field rather than inside the publisher name.
@InProceedings{Gutscher2022_1163,
  author    = {Gutscher, Lorenz and Pucher, Michael},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2022},
  title     = {Improving the quality of synthesized speech of a {Viennese} dialect speaker through speaker adaptation},
  year      = {2022},
  editor    = {Niebuhr, Oliver and Svensson Lundmark, Malin and Weston, Heather},
  month     = mar,
  pages     = {228--234},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {Text-to-speech systems recently experienced a push towards Deep Neural
Network-based approaches that achieve high quality for standard languages.
Such systems especially benefit from big datasets that often are not available for
dialects. By the example of a Standard Austrian German speaker and a Viennese
Dialect speaker, it is examined if a combined training can improve the quality of
the dialect speaker. We use the term “dialect” here because it is well known for
describing regional language variation, although the Viennese dialect is a sociolect
in the strict sense, since the variation is on a social dimension. It is shown that
the similarities between those varieties are sufficient to benefit from the additional
data. Using an open source neural network speech synthesis system [1], an average
voice model is built and afterwards used to fine-tune it to the Viennese Dialect
speaker. In a subjective listening test, participants are asked to rate stimuli in relation
to perceived naturalness. Synthetic audio samples of the proposed model
are judged as more natural compared to a baseline model where training is only
based on the Viennese Dialect speaker. An objective evaluation indicates that – in
reference to the natural recording – mel-cepstral distortion is roughly the same for
both systems. When fine-tuning the average voice to the Standard Austrian German
speaker, no remarkable benefits can be found. Additionally, it is tested whether a
method called multi-task learning can further improve the synthesis quality by using
additional pole-zero features for modeling nasal and lateral phones. Looking at
objective and subjective measures, we conclude that in a multi-task learning setting
the use of additional pole-zero features does not increase speech quality.},
  isbn      = {978-3-95908-548-9},
  issn      = {0940-6832},
  keywords  = {Prosody},
  url       = {https://www.essv.de/pdf/2022_228_234.pdf},
}