% Keynote abstract in the ESSV 2010 proceedings volume (pp. 102--103).
% The citation key is intentionally left unchanged (including the non-ASCII
% character) so that existing citations of this entry keep resolving.
@InProceedings{Möller2010_546,
author = {Möller, Sebastian and Hinterleitner, Florian},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2010},
title = {Instrumental Evaluation of Synthesized Speech Quality},
year = {2010},
editor = {Mixdorff, Hansjörg},
month = mar,
pages = {102--103},
publisher = {TUDpress},
address = {Dresden},
abstract = {Whereas methods for synthesizing speech signals from written text have made considerable
advances in the past decade, methods for assessing the performance of speech synthesizers
and evaluating their fitness for particular applications are still cumbersome. A reason is that
quality assessment and evaluation require perception and judgment processes to take place,
which ultimately happen only inside a human assessor. Thus, auditory test methods are
currently the only way to validly and reliably assess and evaluate synthesized speech quality.
Still, advances in speech transmission quality prediction show up ways to model perception
and judgment processes to a limited extent, and thus to predict speech transmission quality on
the basis of instrumental measurements only. We are thus interested in the question whether
such an approach is also feasible with synthesized speech signals and their corresponding
degradations originating from the synthesis process.
In this talk, we will discuss several such ways and analyze their (expected) performance on
different synthesized speech databases. First, we will briefly review approaches which rely on
the availability of a natural reference speech signal, and compare synthesized speech signals
to such natural references [1]. This approach is limited by the (non-) availability of natural
speech data usually required from the same speaker the synthesis inventory has been built
from. Second, we will address approaches which rely on a model of natural speech, and
derive quality predictions on the basis of the similarity of the synthesized speech signal to this
model [2, 3, 4]. Third, we will review approaches extracting parameters from the synthesized
speech signals which coincide with particular types of degradations [5]. Such approaches have
been successful with transmitted speech signals, but the parameters heavily depend of the
speech databases and speaker gender. Finally, we will show that improvements can be
reached by combining different types of approaches [6]. We will justify our claims on the
basis of empirical data from typical German synthesizers as well as from the Blizzard
Challenges organized as a controlled comparative assessment of synthesized speech. We will
address the deficiencies of the current approaches by showing that their performance heavily
depends on the used databases, and will identify research which has to be carried out jointly
by the synthesis and evaluation communities to overcome the current limitations.},
isbn = {978-3-941298-85-9},
issn = {0940-6832},
keywords = {Keynote},
url = {https://www.essv.de/pdf/2010_102_103.pdf},
}