% Paper in the "Elektronische Sprachsignalverarbeitung 2024" proceedings volume:
% relates a speaker's visual features during inter-turn pauses to their upcoming utterance.
@InProceedings{Schuler2024_1219,
  author    = {Schuler, Christian and Nayak, Shravan and Saha, Debjoy and Baumann, Timo},
  title     = {Can We See Your Response Before You Speak? {Exploring} Linguistic Information Found in Inter-Turn Pauses},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2024},
  editor    = {Baumann, Timo},
  publisher = {TUDpress},
  address   = {Dresden},
  year      = {2024},
  month     = mar,
  pages     = {165--172},
  isbn      = {978-3-95908-325-6},
  issn      = {0940-6832},
  doi       = {10.35096/othr/pub-7094},
  url       = {https://www.essv.de/pdf/2024_165_172.pdf},
  keywords  = {Large Language Models},
  abstract  = {In this work we assess whether there is information in pauses in-between
utterances of the same or different speakers that are predictive of the following
speaker's utterance. We present models that connect a person's visual features
before they speak to their upcoming utterance. In our experiments we find that
out-of-the-box pre-trained models can already reach a better-than-chance performance
in correlating video embeddings to utterance embeddings. In contrast, models that
attempt to predict the first word after the pause do not outperform a unigram model,
indicating that our models do not read lips (based e.g. on co-articulation effects) but
rather capture more fundamental aspects of the upcoming utterance.},
}