author = {Sebastian Möller and Florian Hinterleitner},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2010},
title = {Instrumental Evaluation of Synthesized Speech Quality},
year = {2010},
editor = {Hansjörg Mixdorff},
month = mar,
pages = {102--103},
publisher = {TUDpress, Dresden},
abstract = {Whereas methods for synthesizing speech signals from written text have made considerable advances in the past decade, methods for assessing the performance of speech synthesizers and evaluating their fitness for particular applications are still cumbersome. A reason is that quality assessment and evaluation require perception and judgment processes to take place, which ultimately happen only inside a human assessor. Thus, auditory test methods are currently the only way to validly and reliably assess and evaluate synthesized speech quality. Still, advances in speech transmission quality prediction show up ways to model perception and judgment processes to a limited extent, and thus to predict speech transmission quality on the basis of instrumental measurements only. We are thus interested in the question whether such an approach is also feasible with synthesized speech signals and their corresponding degradations originating from the synthesis process. In this talk, we will discuss several such ways and analyze their (expected) performance on different synthesized speech databases. First, we will briefly review approaches which rely on the availability of a natural reference speech signal, and compare synthesized speech signals to such natural references [1]. This approach is limited by the (non-)availability of natural speech data usually required from the same speaker the synthesis inventory has been built from. Second, we will address approaches which rely on a model of natural speech, and derive quality predictions on the basis of the similarity of the synthesized speech signal to this model [2, 3, 4]. Third, we will review approaches extracting parameters from the synthesized speech signals which coincide with particular types of degradations [5]. Such approaches have been successful with transmitted speech signals, but the parameters heavily depend on the speech databases and speaker gender.
Finally, we will show that improvements can be reached by combining different types of approaches [6]. We will justify our claims on the basis of empirical data from typical German synthesizers as well as from the Blizzard Challenges organized as a controlled comparative assessment of synthesized speech. We will address the deficiencies of the current approaches by showing that their performance heavily depends on the used databases, and will identify research which has to be carried out jointly by the synthesis and evaluation communities to overcome the current limitations.},
isbn = {978-3-941298-85-9},
issn = {0940-6832},
keywords = {Keynote},
url = {https://www.essv.de/pdf/2010_102_103.pdf},