% Conference paper in the "Elektronische Sprachsignalverarbeitung 2024"
% proceedings. Names use the unambiguous "Last, First" form; the publisher's
% city is kept in `address` rather than being appended to `publisher`.
@InProceedings{Schuler2024_1219,
  author    = {Schuler, Christian and Nayak, Shravan and Saha, Debjoy and Baumann, Timo},
  title     = {Can We See Your Response Before You Speak? Exploring Linguistic Information Found in Inter-Turn Pauses},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2024},
  editor    = {Baumann, Timo},
  year      = {2024},
  month     = mar,
  pages     = {165--172},
  publisher = {TUDpress},
  address   = {Dresden},
  isbn      = {978-3-95908-325-6},
  issn      = {0940-6832},
  doi       = {10.35096/othr/pub-7094},
  url       = {https://www.essv.de/pdf/2024_165_172.pdf},
  keywords  = {Large Language Models},
  abstract  = {In this work we assess whether there is information in pauses in-between utterances of the same or different speakers that are predictive of the following speaker's utterance. We present models that connect a person's visual features before they speak to their upcoming utterance. In our experiments we find that out-of-the-box pre-trained models can already reach a better-than-chance performance in correlating video embeddings to utterance embeddings. In contrast, models that attempt to predict the first word after the pause do not outperform a unigram model, indicating that our models do not read lips (based e.g. on co-articulation effects) but rather capture more fundamental aspects of the upcoming utterance.},
}