% ESSV 2004 conference paper: flexible decision tree clustering for grapheme based ASR (English and German).
@InProceedings{Mimer2004_505,
  author = {Mimer, Borislava and Stüker, Sebastian and Schultz, Tanja},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2004},
  title = {Flexible Decision Trees for Grapheme Based Speech Recognition},
  year = {2004},
  editor = {Fellbaum, Klaus},
  month = mar,
  pages = {79--86},
  publisher = {TUDpress},
  address = {Dresden},
  abstract = {Over the last decades research in the field of automatic speech recognition
(ASR) has seen enormous progress. Speech recognition systems are now
deployed in real world applications, such as commercial software systems on PCs
or Workstations, embedded in consumer devices such as cell phones or car navigation
systems, or as part of specialized appliances.
With this increasing economic relevance it has become more and more important
to be able to rapidly extend speech recognition systems by new words, or to port
them to new, previously unseen languages or domains. Hereby the speed and cost
of development are of great importance.
One of the most labor and cost intensive components of a speech recognition system
is the pronunciation dictionary. Its creation often requires the application of linguistic
knowledge. Even though automatic procedures for the creation of phoneme
based dictionaries exist, they often require manual postprocessing by experts. This
manual postprocessing is expensive and time intensive.
Therefore, the use of grapheme based speech recognizers has seen increased research
lately [1, 2, 3]. Hereby words are segmented into graphemes instead of
phonemes. The use of graphemes as modeling units has the advantage over the use
of phonemes that it makes the creation of the pronunciation dictionary a trivial task,
saving time and money.
While a phoneme sequence is designed to describe the pronunciation of a word,
the relation between the grapheme sequence of a word and its pronunciation is
highly dependent on the writing system of the language in question and can be
rather loosely coupled. Therefore, the context depending modeling of the units
and the sharing of parameters are of central importance. Also, one does not obtain
pronunciation variants when using grapheme based pronunciation dictionaries.
However, recent experiments have shown that graphemes are equally well suited as
phonemes for specific languages [2, 3].
In this article we investigate the potential of a flexible decision tree clustering
scheme for context dependent modeling as proposed by Hua et al. for grapheme
based speech recognition. To do so we trained grapheme based speech recognizers
in two languages — English and German — and compared the word error rates
when using the regular clustering procedure to when using the flexible clustering.
Through the use of the enhanced clustering procedure we were able to reduce the
word error rate of the grapheme based recognizer by up to 9.3\% relative, showing
that for German and English graphemes are suited as units for implicit pronunciation
modeling.},
  isbn = {978-3-937672-65-6},
  issn = {0940-6832},
  keywords = {Spracherkennung},
  url = {https://www.essv.de/pdf/2004_79_86.pdf},
}