% Conference paper in the "Elektronische Sprachsignalverarbeitung 2025" proceedings.
% The publisher's city is stored in `address`, not inside `publisher`, so styles can format it.
@InProceedings{Khatri2025_1231,
  author    = {Khatri, Phrashant and Mixdorff, Hansjörg and Rao, Preeti and Rilliard, Albert},
  title     = {Recognition of audio-visual attitudes},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2025},
  editor    = {Grawunder, Sven},
  year      = {2025},
  month     = mar,
  pages     = {19--26},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {We investigate audiovisual features for classifying attitudes, a key aspect
of communication that has been less studied than emotion recognition. Using a German
audiovisual dataset labeled with speaker intention and perceived attitude, we test
acoustic and visual features that have achieved state-of-the-art emotion recognition
results. Our classification achieves performance significantly above chance for 16
attitudes, closely aligning with perceptual ratings in diversity across attitudes and
speakers. We emphasize the challenges of processing nuanced expressions compared to
prototypical emotions. While audiovisual classifications outperform humans in some
areas, they fall short of fully leveraging the combined strengths of audio and visual
cues. This study highlights the potential for improved cross-modal fusion and calls for
further research on visual feature extraction in affective studies.},
  isbn      = {978-3-95908-803-9},
  issn      = {0940-6832},
  keywords  = {Multimodal Perception of Speech and Non-verbal Cues},
  url       = {https://www.essv.de/pdf/2025_19_26.pdf},
}