Conference paper in the ESSV 2025 proceedings volume (pages 231--238).
The publisher name and the publisher's city are stored in separate fields
(publisher / address) so that the bibliography style decides how to print them.
NOTE(review): Dresden is taken to be the publisher's city, not the conference venue -- confirm.

@InProceedings{Winkler2025_1257,
  author    = {Winkler, Lisa and Schindler, Melanie and Herygers, Aaricia and Gaida, Christian and Gräßer, Felix and Petrick, Rico and Eisenhaber, Frank and Henker, Matthias},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2025},
  title     = {Modular text normalization pipeline for language model training},
  year      = {2025},
  editor    = {Grawunder, Sven},
  month     = mar,
  pages     = {231--238},
  publisher = {TUDpress},
  address   = {Dresden},
  abstract  = {Language modeling plays an integral part in natural language processing tasks, and speech recognition applications especially require clean data for cohesive results. Most existing text normalization and data cleaning algorithms are strict end-to-end solutions and allow little customization. The presented text normalization pipeline is modular and configurable and can be applied to various text sources. The integration of additional steps into the pipeline that remove text garbage proved to be advantageous for text generation with language models that were generated solely with this pipeline. Furthermore, the combination of rule-based and machine learning processes proved to be effective in producing data faster than previous solutions. },
  isbn      = {978-3-95908-803-9},
  issn      = {0940-6832},
  keywords  = {Poster},
  url       = {https://www.essv.de/pdf/2025_231_238.pdf},
}