% ESSV 2016 proceedings paper by Harald Höge on Bark-feature statistics and phone error rates.
% Key kept as-is (non-ASCII) so existing \cite commands keep resolving.
@InProceedings{Höge2016_323,
author = {Höge, Harald},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2016},
title = {The Statistics and Phone Error Rates of {Bark}-Features},
year = {2016},
editor = {Jokisch, Oliver},
month = mar,
pages = {53--60},
publisher = {TUDpress},
address = {Dresden},
abstract = {We simulate the process of feature processing as assumed to be done in the
human brain. The simulation is based on the principle that the features are processed
independently in critical bands. 30 critical bands are realized by a Gammatone filterbank.
The output of each band is segmented into phones. From each segment and each band a
‘modulation feature vector’ is extracted assuming that the spectrum of modulation is
stationary during the duration of a phone. Using GMMs trained on those modulation
features, a recognizer is constructed for each filter. The segments are classified into
phonemes leading to a phone error rate per band. Given the emission probabilities of the
GMMs the probabilities for each phone and for each band are determined. In our approach
these probabilities build the components of a ‘phone feature vector’, which is assumed to be
processed in the auditory cortex. To the author's knowledge the transformation of the
modulation features to phone features is unknown neuro-physically. Yet from perceptive
experiments we know some statistical properties of the phone features concerning the relation
between the human phone error rate per band and the human error rate of unfiltered phones
[2]. To evaluate this relation we combine the 30 phone feature vectors and construct an ‘all band’
recognizer. Classification of segments of Spanish speech using 32 phones leads to an
‘all band’ phone error rate of 48\% and a phone error rate per band of about 92\%. These error
rates deviate significantly from human performance.},
isbn = {978-3-959080-40-8},
issn = {0940-6832},
keywords = {Spracherkennung und Dialogsysteme},
url = {https://www.essv.de/pdf/2016_53_60.pdf},
}