@InProceedings{Ghahabi2018_393,
author = {Omid Ghahabi and Wei Zhou and Volker Fischer},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2018},
title = {A Robust Voice Activity Detection for Real-Time Automatic Speech Recognition},
year = {2018},
editor = {André Berton and Udo Haiber and Wolfgang Minker},
month = mar,
pages = {85--91},
publisher = {TUDpress},
address = {Dresden},
abstract = {Voice Activity Detection (VAD), locating speech segments within an audio
recording, is a main part of most speech technology applications. Non-speech
segments, e.g., silence, noise, and music, usually do not carry any interesting
information in speech recognition applications and they even degrade the performance
of the recognition system in terms of both the accuracy and computational cost.
Various VAD techniques have been developed, but not all of them are appropriate
for a real-time application where the robustness, accuracy, and the processing time
are the main keys. In this paper, we propose a fast and robust VAD for a real-time
Automatic Speech Recognition (ASR) task. The main goal is to efficiently filter
out the non-speech segments before processing the speech segments of the audio
signal by the decoder. The proposed technique is a hybrid supervised/unsupervised
model based on zero-order Baum-Welch statistics obtained from a Universal
Background Model (UBM). We will show that not only the processing time for the whole
speech recognition task is decreased by 39%, but also the Word Error Rate (WER)
is reduced by about 1.9% relative.},
isbn = {978-3-959081-28-3},
issn = {0940-6832},
keywords = {Poster},
url = {https://www.essv.de/pdf/2018_85_91.pdf},
}