@article{ZNSL_2024_540_a11,
author = {N. Sukhanovskii and M. Ryndin},
title = {MMA: a fight for multilingual models acceleration},
journal = {Zapiski Nauchnykh Seminarov POMI},
pages = {214--232},
year = {2024},
volume = {540},
language = {en},
url = {http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a11/}
}
N. Sukhanovskii; M. Ryndin. MMA: a fight for multilingual models acceleration. Zapiski Nauchnykh Seminarov POMI, Investigations on applied mathematics and informatics. Part IV, Vol. 540 (2024), pp. 214–232. http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a11/
[1] Z. Yang, Y. Cui, and Z. Chen, “TextPruner: A Model Pruning Toolkit for Pre-Trained Language Models”, Proc. 60th Annual Meet. Assoc. Comput. Linguist.: Syst. Demonstr., 2022, 35–43 | DOI
[2] J. Devlin, M.-W. Chang, K. Lee, and K. Toutanova, BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding, 2019, arXiv: 1810.04805
[3] P. Michel, O. Levy, and G. Neubig, Are Sixteen Heads Really Better than One?, Adv. Neural Inf. Process. Syst., 32 (2019)
[4] E. Voita, D. Talbot, F. Moiseev, R. Sennrich, and I. Titov, “Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned”, Proc. 57th Annual Meet. Assoc. Comput. Linguist., 2019, 5797–5808 | DOI
[5] V. Sanh, L. Debut, J. Chaumond, and T. Wolf, DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter, 2019, arXiv: 1910.01108
[6] F. Lagunas, E. Charlaix, V. Sanh, and A.M. Rush, Block Pruning For Faster Transformers, 2021, arXiv: 2109.04838 | Zbl
[7] J.S. McCarley, Pruning a BERT-based Question Answering Model, 2019, arXiv: 1910.06360
[8] T. Hoefler, D. Alistarh, T. Ben-Nun, N. Dryden, and A. Peste, “Sparsity in deep learning: Pruning and growth for efficient inference and training in neural networks”, J. Mach. Learn. Res., 22:1 (2021), 10882–11005 | MR
[9] E. Kurtic, D. Campos, T. Nguyen, E. Frantar, M. Kurtz, B. Fineran, M. Goin, and D. Alistarh, The Optimal BERT Surgeon: Scalable and Accurate Second-Order Pruning for Large Language Models, 2022, arXiv: 2203.07259
[10] L.N. Smith, No More Pesky Learning Rate Guessing Games, 2015, arXiv: 1506.01186
[11] E. Kurtic, D.F. Campos, T. Nguyen, E. Frantar, M. Kurtz, B. Fineran, M. Goin, and D. Alistarh, “The Optimal BERT Surgeon: Scalable and Accurate Second-Order Pruning for Large Language Models”, Conf. Empir. Methods Nat. Lang. Process., 2022
[12] W. Wang, F. Wei, L. Dong, H. Bao, N. Yang, and M. Zhou, MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers, 2020, arXiv: 2002.10957
[13] Y. Liu, M. Ott, N. Goyal, J. Du, M. Joshi, D. Chen, O. Levy, M. Lewis, L. Zettlemoyer, and V. Stoyanov, RoBERTa: A Robustly Optimized BERT Pretraining Approach, 2019, arXiv: 1907.11692
[14] S.P. Singh and D. Alistarh, WoodFisher: Efficient Second-Order Approximation for Neural Network Compression, 2020, arXiv: 2004.14340
[15] B. Hassibi and D.G. Stork, “Second Order Derivatives for Network Pruning: Optimal Brain Surgeon”, Adv. Neural Inf. Process. Syst., 1992
[16] X. Jiao, Y. Yin, L. Shang, X. Jiang, X. Chen, L. Li, F. Wang, and Q. Liu, TinyBERT: Distilling BERT for Natural Language Understanding, 2020, arXiv: 1909.10351
[17] E.F. Tjong Kim Sang, Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition, 2002, arXiv: cs/0209010 | Zbl
[18] A. Conneau, K. Khandelwal, N. Goyal, V. Chaudhary, G. Wenzek, F. Guzmán, E. Grave, M. Ott, L. Zettlemoyer, and V. Stoyanov, Unsupervised Cross-lingual Representation Learning at Scale, 2020, arXiv: 1911.02116
[19] T. Pires, E. Schlinger, and D. Garrette, How multilingual is Multilingual BERT?, 2019, arXiv: 1906.01502
[20] Z. Yang, Y. Cui, Z. Chen, W. Che, T. Liu, S. Wang, and G. Hu, “TextBrewer: An Open-Source Knowledge Distillation Toolkit for Natural Language Processing”, Proc. 58th Annual Meet. Assoc. Comput. Linguist.: Syst. Demonstr., 2020, 9–16 | DOI
[21] E.F. Tjong Kim Sang and F. De Meulder, Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition, 2003, arXiv: cs/0306050
[22] W. Wang, F. Wei, L. Dong, H. Bao, N. Yang, and M. Zhou, MiniLM: Deep Self-Attention Distillation for Task-Agnostic Compression of Pre-Trained Transformers, 2020, arXiv: 2002.10957
[23] T. Gale, E. Elsen, and S. Hooker, The State of Sparsity in Deep Neural Networks, 2019, arXiv: 1902.09574
[24] H. Wu, P. Judd, X. Zhang, M. Isaev, and P. Micikevicius, Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation, 2020, arXiv: 2004.09602