@InProceedings{Huang2026_1299,
  author    = {Huang, Zihao and Zhang, Tianyi and Birkholz, Peter},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2026, Tagungsband der 37. Konferenz},
  title     = {Self-Supervised Multi-Task Learning for Enhanced Prosody Prediction in {German} Articulatory Speech Synthesis},
  year      = {2026},
  editor    = {Wirsching, G{\"u}nther},
  month     = mar,
  pages     = {224--231},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {This paper presents a systematic comparison of self-supervised pre-training strategies for prosody modelling. We evaluated three pretext tasks within a unified LSTM-based architecture. The pre-trained encoder is integrated into a multi-task prosody model that jointly predicts phoneme duration, fundamental frequency ({$f_0$}), and voicing. Objective evaluation showed that all pre-training methods improve prosody prediction compared to a baseline, particularly for pitch. Subjective listening tests, however, revealed no significant differences in perceived naturalness, indicating that objective gains do not always translate into perceptual advantage. These findings demonstrate that self-supervised pre-training enhances prosody prediction, while perceptual benefits depend on specific aspects of prosodic realization.},
  isbn      = {978-3-95908-834-3},
  issn      = {0940-6832},
  keywords  = {Posters},
  url       = {https://www.essv.de/pdf/2026_224_231.pdf},
}