% Speech-recognition references on dynamic-programming time alignment.
% Citation keys are kept exactly as they were so existing \cite commands
% continue to resolve.

% Sakoe and Chiba (1978): DP-based time-normalization algorithm with a
% slope constraint on the warping function.
@article{Sakoe1978,
  author   = {Sakoe, H. and Chiba, S.},
  title    = {Dynamic programming algorithm optimization for spoken word
              recognition},
  journal  = {{IEEE} Transactions on Acoustics, Speech, and Signal Processing},
  year     = {1978},
  month    = feb,
  volume   = {26},
  number   = {1},
  pages    = {43--49},
  issn     = {0096-3518},
  abstract = {This paper reports on an optimum dynamic programming (DP) based
              time-normalization algorithm for spoken word recognition. First,
              a general principle of time-normalization is given using
              time-warping function. Then, two time-normalized distance
              definitions, called symmetric and asymmetric forms, are derived
              from the principle. These two forms are compared with each other
              through theoretical discussions and experimental studies. The
              symmetric form algorithm superiority is established. A new
              technique, called slope constraint, is successfully introduced,
              in which the warping function slope is restricted so as to
              improve discrimination between words in different categories.
              The effective slope constraint characteristic is qualitatively
              analyzed, and the optimum slope constraint condition is
              determined through experiments. The optimized algorithm is then
              extensively subjected to experimental comparison with various
              DP-algorithms, previously applied to spoken word recognition by
              different research groups. The experiment shows that the present
              algorithm gives no more than about two-thirds errors, even
              compared to the best conventional algorithm.},
}

% Itakura (1975): isolated-word recognition by minimum prediction residual,
% registering reference LPC patterns onto the input with DP.
@article{Itakura1975,
  author   = {Itakura, F.},
  title    = {Minimum prediction residual principle applied to speech
              recognition},
  journal  = {{IEEE} Transactions on Acoustics, Speech, and Signal Processing},
  year     = {1975},
  month    = feb,
  volume   = {23},
  number   = {1},
  pages    = {67--72},
  issn     = {0096-3518},
  abstract = {A computer system is described in which isolated words, spoken
              by a designated talker, are recognized through calculation of a
              minimum prediction residual. A reference pattern for each word
              to be recognized is stored as a time pattern of linear
              prediction coefficients (LPC). The total log prediction residual
              of an input signal is minimized by optimally registering the
              reference LPC onto the input autocorrelation coefficients using
              the dynamic programming algorithm (DP). The input signal is
              recognized as the reference word which produces the minimum
              prediction residual. A sequential decision procedure is used to
              reduce the amount of computation in DP. A frequency
              normalization with respect to the long-time spectral
              distribution is used to reduce effects of variations in the
              frequency response of telephone connections. The system has been
              implemented on a DDP-516 computer for the 200-word recognition
              experiment. The recognition rate for a designated male talker is
              97.3 percent for telephone input, and the recognition time is
              about 22 times real time.},
}

% Velichko and Zagoruyko (1970). The bibsource field is not a standard
% BibTeX field; it is retained only as a provenance note.
@article{Velichko,
  author    = {Velichko, V. M. and Zagoruyko, N. G.},
  title     = {Automatic Recognition of 200 Words},
  journal   = {International Journal of Man-Machine Studies},
  year      = {1970},
  volume    = {2},
  number    = {3},
  pages     = {223--234},
  bibsource = {http://www.interaction-design.org/references/},
}

% White and Neely (1976): comparison of preprocessing (linear prediction vs.
% bandpass filtering) and time-alignment strategies.
@article{White1976,
  author   = {White, G. and Neely, R.},
  title    = {Speech recognition experiments with linear predication,
              bandpass filtering, and dynamic programming},
  journal  = {{IEEE} Transactions on Acoustics, Speech, and Signal Processing},
  year     = {1976},
  month    = apr,
  volume   = {24},
  number   = {2},
  pages    = {183--188},
  issn     = {0096-3518},
  abstract = {Automatic speech recognition experiments are described in which
              several popular preprocessing and classification strategies are
              compared. Preprocessing is done either by linear predictive
              analysis or by bandpass filtering. The two approaches are shown
              to produce similar recognition scores. The classifier uses
              either linear time stretching or dynamic programming to achieve
              time alignment. It is shown that dynamic programming is of major
              importance for recognition of polysyllabic words. The speech is
              compressed into a quasi-phoneme character string or preserved
              uncompressed. Best results are obtained with uncompressed data,
              using nonlinear time registration for multisyllabic words.},
}