% Conference paper in the "Elektronische Sprachsignalverarbeitung 2026" proceedings, pp. 40--46.
@InProceedings{Winkler2026_1276,
  author    = {Winkler, Lisa and Wendemuth, Andreas},
  title     = {An Approach to Improving Robustness in Dynamic Acoustic Environments: Context Noise Representation Learning for Urban Speech Emotion Recognition},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2026, Tagungsband der 37. Konferenz},
  editor    = {Wirsching, Günther},
  year      = {2026},
  month     = mar,
  pages     = {40--46},
  publisher = {TUDpress},
  address   = {Dresden},
  isbn      = {978-3-95908-834-3},
  issn      = {0940-6832},
  url       = {https://www.essv.de/pdf/2026_40_46.pdf},
  keywords  = {Speech Signal Recognition and Enhancement},
  abstract  = {In modern urban environments, speech recognition systems often face significant degradation due to background noise. Conventional approaches often rely on signal enhancement or generative error correction, which can inadvertently remove high-level emotional cues essential for understanding user intent. In this work, we propose a context noise representation learning (CNRL) framework that enhances robustness by aligning noisy speech representations with their clean counterparts in the latent space. By leveraging the conversational context and a feature fusion strategy, our model learns to recover clean emotional features. Evaluated on the IEMOCAP dataset using a strict Leave-One-Session-Out (LOSO) protocol, our method demonstrates improved robustness in low-SNR conditions compared to baseline approaches.},
}