author = {Yu, Wentao and Zeiler, Steffen and Kolossa, Dorothea},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2021},
title = {Towards reliability-guided information integration in audio-visual speech recognition},
year = {2021},
editor = {Hillmann, Stefan and Weiss, Benjamin and Michael, Thilo and Möller, Sebastian},
month = mar,
pages = {104--111},
publisher = {TUDpress},
address = {Dresden},
abstract = {Audio-visual speech recognition can improve the recognition rate in many small-vocabulary tasks. But for large vocabularies, due to difficulties like unsatisfactory lipreading accuracies, improving the recognition rate over audio-only baselines remains difficult. In this work, we propose a new fusion strategy, which fuses the state posteriors of separate stream recognizers through a bidirectional LSTM network. Our proposed fusion strategy outperforms all baselines as well as oracle dynamic stream-weighting, which gives a theoretical upper bound for dynamic stream-weighting approaches. The proposed system achieves a relative word error rate reduction of 42.18\% compared to the audio-only setup and 34.73\% compared to the non-oracle dynamic stream-weighting baseline.},
isbn = {978-3-959082-27-3},
issn = {0940-6832},
keywords = {Automatische Spracherkennung},
url = {https://www.essv.de/pdf/2021_104_111.pdf},