% Keynote abstract from the ESSV 2010 proceedings volume (pages 31--32).
@InProceedings{Pfitzinger2010_536,
author = {Hartmut R. Pfitzinger},
booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2010},
title = {Shifting the Paradigm from Source-Filter-Modelling to Segmental-Suprasegmental Modelling of Speech},
year = {2010},
address = {Dresden},
editor = {Hansjörg Mixdorff},
month = mar,
pages = {31--32},
publisher = {TUDpress},
abstract = {The human voice conveys linguistic content but also information about the speaker's gender, age,
physique, and health situation, about his attitudes and emotions, about his personality, about his social
and educational background, about his dialect and places where he resided for a longer time, and also
about the interlocutor to whom he adapts his speaking style (which at least becomes obvious in the case
of child- or pet-directed speech). Many of these factors are discussed since a long time but it seems
impossible to list or even consider or investigate all factors determining the human voice, although the
number of studies concerned with interactions between some of these factors is continually increasing
during the last decade. However, the absolute number of those studies is still small. We claim that this is
mainly due to a lack of an appropriate speech model.
Well-established speech models provide a strictly sequential concept from the "sender" to the "receiver"
via the speech production mechanism, the acoustic transmission, and the speech perception system.
They are not capable of directly representing the multi-dimensional interactions between the abovementioned factors.
Therefore, the keynote talk presents a new model of speech proposing a paradigm shift from the
classical articulatory-acoustic view of speech, that is perfectly represented by the acoustical theory of
speech production and the well-established source-filter-model, towards a functional speech model
which separates each of both, the source as well as the filter, into two components: the segmental layer
and the suprasegmental layer. The segmental layer is concerned with those excitation signal variations
and vocal tract deformations induced by the articulatory rendering of words, whereas the suprasegmental
layer contains (besides the linguistic information that prosody provides) also and mainly non-linguistic
information, i.e. expressive, speaker-specific, para-, and extra-linguistic effects on the excitation signal
as well as on the vocal tract.
The model is established by a data-driven parameter decomposition approach based on mathematical
methods taken from linear algebra e.g. multiple linear regression (MLR), singular value decomposition
(SVD), principal component analysis (PCA), and linear predictive coding (LPC). Details of this
technique are presented at the conference. This functional speech model enables access to higher-level
knowledge and parameter control in automatic speech analysis, modification, and synthesis.},
isbn = {978-3-941298-85-9},
issn = {0940-6832},
keywords = {Keynote},
url = {https://www.essv.de/pdf/2010_31_32.pdf},
}