@InProceedings{Huang2026_1299,
  author    = {Huang, Zihao and Zhang, Tianyi and Birkholz, Peter},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2026, Tagungsband der 37. Konferenz},
  title     = {Self-Supervised Multi-Task Learning for Enhanced Prosody Prediction in {German} Articulatory Speech Synthesis},
  year      = {2026},
  editor    = {Wirsching, G{\"u}nther},
  month     = mar,
  pages     = {224--231},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {This paper presents a systematic comparison of self-supervised pre-training strategies for prosody modelling. We evaluated three pretext tasks within a unified LSTM-based architecture. The pre-trained encoder is integrated into a multi-task prosody model that jointly predicts phoneme duration, fundamental frequency ({$f_0$}), and voicing. Objective evaluation showed that all pre-training methods improve prosody prediction compared to a baseline, particularly for pitch. Subjective listening tests, however, revealed no significant differences in perceived naturalness, indicating that objective gains do not always translate into perceptual advantage. These findings demonstrate that self-supervised pre-training enhances prosody prediction, while perceptual benefits depend on specific aspects of prosodic realization.},
  isbn      = {978-3-95908-834-3},
  issn      = {0940-6832},
  keywords  = {Posters},
  url       = {https://www.essv.de/pdf/2026_224_231.pdf},
}