@Comment{Gao2020_437: conference paper in the ESSV 2020 proceedings volume.
  Names are stored in "Last, First" form; the publisher's city lives in
  `address`; the percent sign in the abstract is escaped for LaTeX.}
@InProceedings{Gao2020_437,
  author    = {Gao, Yingming and Steiner, Peter and Birkholz, Peter},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2020},
  title     = {Articulatory Copy Synthesis using Long-Short Term Memory Networks},
  year      = {2020},
  editor    = {Wendemuth, Andreas and B{\"o}ck, Ronald and Siegert, Ingo},
  month     = mar,
  pages     = {52--59},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {Investigating speech imitation, in particular articulatory copy synthesis,
benefits the understanding of speech production and can improve speech recognition
and synthesis. We proposed a framework for copy synthesis with an artificial
neural network (LSTM regression model) and an articulatory speech synthesizer
(VocalTractLab), which were responsible for the acoustic-to-articulatory mapping
and the inversion, respectively. We used rule-based method to create gestural scores
from texts, which were converted to articulatory trajectories and subsequently simulated
to produce the corresponding acoustic signals. To make the subsequent
mapping more robust, we expanded the acoustic and articulatory space by manipulating
speaking effort, voice quality, pitch level, and vocal tract length of the
created gestural scores or acoustic signals, producing 81 variants for each utterance.
With acoustic features as input and articulatory trajectories as output, we
trained the LSTM models to build the acoustic-to-articulatory inversion. For testing,
we estimated the articulatory trajectories from acoustic features, thus obtaining
the underlying articulatory process. The experiments showed that the correlation
coefficients (between estimated articulatory trajectories from acoustic features and
the real ones converted from gestural scores) ranged from 0.18 to 0.973 and the root
mean square error (RMSE) ranged from 0.043 to 0.255 for the concerned 30 articulatory
parameters of VocalTractLab. The estimated articulatory parameters were
further fed into VocalTractLab, whose output speech achieved a word recognition
accuracy of 17.24\%.},
  isbn      = {978-3-95908-193-1},
  issn      = {0940-6832},
  keywords  = {Speech Synthesis},
  url       = {https://www.essv.de/pdf/2020_52_59.pdf},
}