% Paper in the "Elektronische Sprachsignalverarbeitung 2025" proceedings.
% Names use the unambiguous "Last, First" form; the publisher's city is
% stored in `address` rather than being appended to `publisher`.
@InProceedings{Khatri2025_1231,
  author    = {Khatri, Phrashant and Mixdorff, Hansjörg and Rao, Preeti and Rilliard, Albert},
  title     = {Recognition of audio-visual attitudes},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2025},
  editor    = {Grawunder, Sven},
  year      = {2025},
  month     = mar,
  pages     = {19--26},
  publisher = {TUDpress},
  address   = {Dresden},
  isbn      = {978-3-95908-803-9},
  issn      = {0940-6832},
  url       = {https://www.essv.de/pdf/2025_19_26.pdf},
  keywords  = {Multimodal Perception of Speech and Non-verbal Cues},
  abstract  = {We investigate audiovisual features for classifying attitudes, a key aspect of communication that has been less studied than emotion recognition. Using a German audiovisual dataset labeled with speaker intention and perceived attitude, we test acoustic and visual features that have achieved state-of-the-art emotion recognition results. Our classification achieves performance significantly above chance for 16 attitudes, closely aligning with perceptual ratings in diversity across attitudes and speakers. We emphasize the challenges of processing nuanced expressions compared to prototypical emotions. While audiovisual classifications outperform humans in some areas, they fall short of fully leveraging the combined strengths of audio and visual cues. This study highlights the potential for improved cross-modal fusion and calls for further research on visual feature extraction in affective studies.},
}