% ESSV 2025 conference paper on Upper Sorbian speech-to-text.
% Names are stored in "Last, First" form; the publisher's city lives in `address`;
% proper nouns in the title are brace-protected against style-driven lowercasing.
@InProceedings{Kraljevski2025_1242,
  author    = {Kraljevski, Ivan and Duckhorn, Frank and Sobe, Daniel and Tsch{\"o}pe, Constanze and Wolff, Matthias},
  title     = {Speech-to-Text in {Upper} {Sorbian}: Current State},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2025},
  editor    = {Grawunder, Sven},
  publisher = {TUDpress},
  address   = {Dresden},
  year      = {2025},
  month     = mar,
  pages     = {109--116},
  isbn      = {978-3-95908-803-9},
  issn      = {0940-6832},
  url       = {https://www.essv.de/pdf/2025_109_116.pdf},
  keywords  = {Benchmarking ASR and TTS},
  abstract  = {This study presents recent advancements in Upper Sorbian
    Speech-to-Text (STT) technology. We provide an overview of the Sorbian
    languages, the available speech and language resources, and the development
    of an STT system based on a traditional approach, which includes acoustic,
    pronunciation, and language modeling.
    Due to the scarcity of resources for Sorbian languages, our approach leverages
    sub-word and word-class modeling techniques. The word-class modeling is based
    on Finite-State Transducer definitions, which are applicable to both offline text
    parsing and integration into the decoding graph of the STT system. Word-class
    parsing is performed on the speech corpus and utilized for language modeling with
    complete words, sub-word units, or both. Additionally, the same definitions can
    be applied to Named Entity Recognition during the post-processing of recognized
    transcriptions.
    This approach significantly reduces out-of-vocabulary words and enables greater
    customization of the recognizer for domain-specific applications. The system was
    implemented for the real-time transcription of church sermon broadcasts in
    Upper Sorbian. The domain-specific system achieved performance comparable to
    fine-tuned OpenAI Whisper models developed also by other initiatives while also
    providing a resource-efficient solution with semantically tagged recognition
    results.},
}