% Conference paper (keyword: Poster), pages 70--76 of the 2018 proceedings volume.
% The publisher's city is stored in `address`, separate from `publisher`, so that
% bibliography styles can place and punctuate it themselves.
% Names use the unambiguous "Last, First" form.
@InProceedings{Harding2018_391,
  author    = {Harding, Philip and Gibson, Matthew},
  title     = {Unsupervised Neural-Network Based Vocal Tract Length Normalization},
  booktitle = {Studientexte zur Sprachkommunikation: Elektronische Sprachsignalverarbeitung 2018},
  editor    = {Berton, André and Haiber, Udo and Minker, Wolfgang},
  year      = {2018},
  month     = mar,
  pages     = {70--76},
  publisher = {TUDpress},
  address   = {Dresden},
  isbn      = {978-3-959081-28-3},
  issn      = {0940-6832},
  url       = {https://www.essv.de/pdf/2018_70_76.pdf},
  keywords  = {Poster},
  abstract  = {In this paper an efficient, unsupervised, method of warp factor estimation
for vocal tract length normalisation (VTLN) is proposed. VTLN is a method
of feature-based speaker normalisation where the frequency spectrum is warped
to produce speaker-independent spectral features that are invariant to vocal tract
length. The degree to which the spectrum is warped is determined by the warping
factor, and it is the estimation of this warping factor that is the focus of this paper.
The warping factor is typically obtained using a maximum likelihood-based technique
that requires a state alignment for each utterance and a GMM acoustic model
trained on warped features. The warp factor is typically quantised, with one of
N warp factors selected for each utterance. The proposed method of warp factor
estimation makes use of a small neural network, trained on un-warped features,
to directly estimate the quantised warp factor. Experimental results are presented
where, unlike previously published methods of unsupervised warp factor estimation
[1, 2], the proposed method is shown to give equivalent performance to the
typical supervised GMM-based method in terms of ASR accuracy at a significantly
lower computational cost.},
}