% Paper in the proceedings of Elektronische Sprachsignalverarbeitung (ESSV) 2011.
@InProceedings{Hinterleitner2011_14,
author = {Hinterleitner, Florian and Zabel, Steve and Möller, Sebastian and Leutelt, Lutz and Norrenbrock, Christoph},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2011},
title = {Predicting the quality of synthesized speech using reference-based prediction measures},
year = {2011},
editor = {Kröger, Bernd J. and Birkholz, Peter},
month = mar,
pages = {99--106},
publisher = {TUDpress},
address = {Dresden},
abstract = {This paper presents research on the use of methods for end-to-end speech quality assessment for the perceptual evaluation of text-to-speech (TTS) systems. We analyze the ITU-T Rec. P.862.2 Wideband Perceptual Evaluation of Speech Quality (WB-PESQ) as well as its wideband optimized successor ITU-T Rec. P.863 Perceptual Objective Listening Quality Assessment (POLQA), and the Diagnostic Instrumental Assessment of Listening quality (DIAL) algorithm (Côté et al., PQS 2010). All measures were originally optimized for the evaluation of telephone networks and speech codecs; thus they are used out of their original domain. In addition to the to-be-evaluated TTS signal all measures also need a natural speech reference as input. The quality estimate is calculated by comparing the natural reference with its corresponding synthetic speech signal. The measures are tested on data collected by the Blizzard Challenge (BC) in the past years. BC is a competition for developers of speech synthesis systems with the intent to train different systems on the same speech corpus and evaluate their performance. Thus, all synthesizers were built on the same voice. We use the natural speech reference and its corresponding TTS signal as input for the above mentioned measures. The correlation between the calculated mean opinion score (MOS) and the perceptually evaluated quality rating serves as an indicator for the accuracy of the prediction. The achieved results were disappointing throughout all databases. The main problem of all 3 algorithms seems to be an inaccurate time alignment between the natural speech file and its corresponding TTS sample. To fix this problem we propose a Dynamic Time Warping between both signals prior to the reference-based evaluation.},
isbn = {978-3-942710-37-4},
issn = {0940-6832},
keywords = {Sprachsynthese-Evaluation und Prosodie},
url = {https://www.essv.de/pdf/pdf/2011_99_106.pdf},
}