@article{ZNSL_2023_529_a3,
author = {D. Grebenkin and I. Bondarenko},
title = {Wav2Vec2 without {Attention:} do you need {Hopfield} {Networks} for {Self-Supervised} {Learning} of {Speech} {Representations?}},
journal = {Zapiski Nauchnykh Seminarov POMI},
pages = {43--53},
year = {2023},
volume = {529},
language = {en},
url = {http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a3/}
}
TY - JOUR
AU - D. Grebenkin
AU - I. Bondarenko
TI - Wav2Vec2 without Attention: do you need Hopfield Networks for Self-Supervised Learning of Speech Representations?
JO - Zapiski Nauchnykh Seminarov POMI
PY - 2023
SP - 43
EP - 53
VL - 529
UR - http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a3/
LA - en
ID - ZNSL_2023_529_a3
ER -
%0 Journal Article
%A D. Grebenkin
%A I. Bondarenko
%T Wav2Vec2 without Attention: do you need Hopfield Networks for Self-Supervised Learning of Speech Representations?
%J Zapiski Nauchnykh Seminarov POMI
%D 2023
%P 43-53
%V 529
%U http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a3/
%G en
%F ZNSL_2023_529_a3
D. Grebenkin; I. Bondarenko. Wav2Vec2 without Attention: do you need Hopfield Networks for Self-Supervised Learning of Speech Representations? Zapiski Nauchnykh Seminarov POMI, Investigations on applied mathematics and informatics. Part II–1, Vol. 529 (2023), pp. 43-53. http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a3/
[1] ml-jku/hopfield-layers: Hopfield networks is all you need, https://github.com/ml-jku/hopfield-layers (last access: 2023-02-15)
[2] Nvidia/stt-ru-conformer-transducer-large, Hugging Face, https://huggingface.co/nvidia (last access: 2023-01-30)
[3] sovaai/sova-dataset, https://github.com/sovaai/sova-dataset (last access: 2023-02-15)
[4] voxforge.org, http://www.voxforge.org/ru (last access: 2023-02-15)
[5] A. Baevski, H. Zhou, A. Mohamed, M. Auli, wav2vec 2.0: A framework for self-supervised learning of speech representations, 2020
[6] S. Bhattamishra, K. Ahuja, N. Goyal, “On the ability and limitations of transformers to recognize formal languages”, Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, 2020, 7096–7116
[7] I. Bondarenko, XLSR Wav2Vec2 Russian, 2022, https://huggingface.co/bond005/wav2vec2-large-ru-golos
[8] A. Conneau, A. Baevski, R. Collobert, A. Mohamed, M. Auli, Unsupervised cross-lingual representation learning for speech recognition, 2020
[9] L. Dong, S. Xu, B. Xu, “Speech-Transformer: A no-recurrence sequence-to-sequence model for speech recognition”, 2018 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP), 2018, 5884–5888
[10] A. Fan, T. Lavril, E. Grave, A. Joulin, S. Sukhbaatar, Addressing some limitations of transformers with feedback memory, 2020
[11] E. Fonseca, M. Plakal, F. Font, D. P. W. Ellis, X. Serra, Audio tagging with noisy labels and minimal supervision, 2020
[12] A. Gulati, J. Qin, C.-C. Chiu, N. Parmar, Y. Zhang, J. Yu, W. Han, S. Wang, Z. Zhang, Y. Wu, R. Pang, Conformer: Convolution-augmented transformer for speech recognition, 2020
[13] J. Hopfield, “Neural networks and physical systems with emergent collective computational abilities”, Proceedings of the National Academy of Sciences of the United States of America, 79 (1982), 2554–2558
[14] N. Karpov, A. Denisenko, F. Minkin, Golos: Russian dataset for speech research, 2021
[15] D. Krotov, J. Hopfield, Dense associative memory for pattern recognition, 2016
[16] L. Kürzinger, D. Winkelbauer, L. Li, T. Watzel, G. Rigoll, “CTC-segmentation of large corpora for German end-to-end speech recognition”, Speech and Computer, eds. Alexey Karpov, Rodmonga Potapova, Springer International Publishing, Cham, 2020, 267–278
[17] Y. LeCun, B. Boser, J. Denker, D. Henderson, R. Howard, W. E. Hubbard, L. Jackel, “Backpropagation applied to handwritten zip code recognition”, Neural Computation, 1 (1989), 541–551
[18] M. I. Matusevich, Modern Russian language. Phonetics, Prosveshchenie Publ., 1976 (in Russian)
[19] L. McInnes, J. Healy, J. Melville, UMAP: Uniform manifold approximation and projection for dimension reduction, 2018
[20] D. Povey, A. Ghoshal, G. Boulianne, L. Burget, O. Glembek, N. Goel, M. Hannemann, P. Motlicek, Y. Qian, P. Schwarz, J. Silovsky, G. Stemmer, K. Veselý, “The Kaldi speech recognition toolkit”, IEEE 2011 Workshop on Automatic Speech Recognition and Understanding, 2011
[21] L. Y. Pratt, “Discriminability-based transfer between neural networks”, Proceedings of the 5th International Conference on Neural Information Processing Systems, NIPS'92 (San Francisco, CA, USA), Morgan Kaufmann Publishers Inc., 1992, 204–211
[22] H. Ramsauer, B. Schäfl, J. Lehner, P. Seidl, M. Widrich, L. Gruber, M. Holzleitner, M. Pavlovic, G. Sandve, V. Greiff, D. Kreil, M. Kopp, G. Klambauer, J. Brandstetter, S. Hochreiter, Hopfield networks is all you need, 2020
[23] D. Variš, O. Bojar, Sequence length is a domain: Length-based overfitting in transformer models, 2021
[24] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin, Attention is all you need, 2017
[25] P. D. Wasserman, Neural computing: theory and practice, Van Nostrand Reinhold Co., New York, NY, USA, 1989
[26] O. Yakovenko, I. Bondarenko, M. Borovikova, D. Vodolazsky, “Algorithms for automatic accentuation and transcription of Russian texts in speech recognition systems”, Speech and Computer, eds. Alexey Karpov, Oliver Jokisch, Rodmonga Potapova, Springer International Publishing, Cham, 2018, 768–777
[27] Q. Zhang, H. Lu, H. Sak, A. Tripathi, E. McDermott, S. Koo, S. Kumar, Transformer transducer: A streamable speech recognition model with transformer encoders and RNN-T loss, 2020