% Karpov and Burtsev (2023), journal article in Zapiski Nauchnykh Seminarov POMI, volume 529.
@article{ZNSL_2023_529_a4,
  author   = {D. Karpov and M. Burtsev},
  title    = {Monolingual and cross-lingual knowledge transfer for topic classification},
  journal  = {Zapiski Nauchnykh Seminarov POMI},
  volume   = {529},
  pages    = {54--71},
  year     = {2023},
  language = {en},
  url      = {http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a4/},
}
D. Karpov; M. Burtsev. Monolingual and cross-lingual knowledge transfer for topic classification. Zapiski Nauchnykh Seminarov POMI, Investigations on applied mathematics and informatics. Part II–1, Tome 529 (2023), pp. 54-71. http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a4/
[1] F. Barrios, F. López, L. Argerich, R. Wachenchauzer, Variations of the similarity function of textrank for automated summarization, 2016, arXiv: 1602.03606
[2] D. Baymurzina, D. Kuznetsov, D. Evseev, D. Karpov, A. Sagirova, A. Peganov, F. Ignatov, E. Ermakova, D. Cherniavskii, S. Kumeyko, O. Serikov, Y. Kuratov, L. Ostyakova, D. Kornev, M. Burtsev, “Dream technical report for the alexa prize 4”, Alexa Prize SocialBot Grand Challenge 4 Proceedings, 2021
[3] V. Beaufils, J. Tomin, Stochastic approach to worldwide language classification: the signals and the noise towards long-range exploration, 2020, http://www.elinguistics.net/Compare_Languages.aspx
[4] P. Blinov, Dataset of russian reviews about medical facilities, 2022 (Accessed: 2023-02-17), https://huggingface.co/datasets/blinoff/healthcare_facilities_reviews
[5] V. Morris, D. van Strien, G. Tolfo, L. Afric, S. Robertson, P. Tiney, A. Dogterom, I. Wollner, 19th century books - metadata with additional crowdsourced annotations, 2021
[6] I. Chalkidis, A. Jana, D. Hartung, M. Bommarito, I. Androutsopoulos, D. M. Katz, N. Aletras, “Lexglue: A benchmark dataset for legal language understanding in english”, Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Dublin, Ireland), 2022
[7] A. Chizhikova, V. Konovalov, M. Burtsev, “Multilingual case-insensitive named entity recognition”, Advances in Neural Computation, Machine Learning, and Cognitive Research VI, eds. Boris Kryzhanovsky, Witali Dunin-Barkowski, Vladimir Redko, and Yury Tiumentsev, Cham, 2023, 448–454
[8] J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, “BERT: Pre-training of deep bidirectional transformers for language understanding”, Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, v. 1, Long and Short Papers, 2019, 4171–4186 (english)
[9] J. FitzGerald, C. Hench, C. Peris, S. Mackie, K. Rottmann, A. Sanchez, A. Nash, L. Urbach, V. Kakarala, R. Singh, S. Ranganath, L. Crist, M. Britan, W. Leeuwis, G. Tur, P. Natarajan, Massive: A 1m-example multilingual natural language understanding dataset with 51 typologically-diverse languages, 2022
[10] S. Petrov, J. Devlin, Official description of the multilingual bert models from google research, 2019, https://github.com/google-research/bert/blob/master/multilingual.md
[11] P. Keung, Y. Lu, G. Szarvas, N. A. Smith, “The multilingual amazon reviews corpus”, Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing, 2020
[12] D. P. Kingma, J. Ba, “Adam: A method for stochastic optimization”, Conference Track Proceedings, 3rd International Conference on Learning Representations, ICLR 2015 (San Diego, CA, USA, May 7-9, 2015), eds. Yoshua Bengio and Yann LeCun, 2015
[13] A. Kolesnikova, Y. Kuratov, V. Konovalov, M. Burtsev, Knowledge distillation of russian language models with reduction of vocabulary, 2022
[14] V. Konovalov, R. Artstein, O. Melamud, I. Dagan, “The negochat corpus of human-agent negotiation dialogues”, Proceedings of the Tenth International Conference on Language Resources and Evaluation, LREC'16 (Portorož, Slovenia, May 2016), European Language Resources Association (ELRA), 3141–3145
[15] V. Konovalov, P. Gulyaev, A. Sorokin, Y. Kuratov, M. Burtsev, “Exploring the bert cross-lingual transfer for reading comprehension”, Komp'juternaja Lingvistika i Intellektual'nye Tehnologii, 2020, 445–453
[16] V. Konovalov, O. Melamud, R. Artstein, I. Dagan, “Collecting Better Training Data using Biased Agent Policies in Negotiation Dialogues”, Proceedings of WOCHAT, the Second Workshop on Chatbots and Conversational Agent Technologies (Los Angeles, September 2016), Zerotype
[17] V. P. Konovalov, Z. B. Tumunbayarova, “Learning word embeddings for low resource languages: the case of buryat”, Komp'juternaja Lingvistika i Intellektual'nye Tehnologii, 2018, 331–341
[18] I. Koziev, Chatbot-ru: Russian intent and topic classification dataset, 2020, https://github.com/Koziev/chatbot/blob/master/data/intents.txt
[19] Y. M. Kuratov, I. F. Yusupov, D. R. Baymurzina, D. P. Kuznetsov, D. V. Cherniavskii, A. Dmitrievskiy, E. S. Ermakova, F. S. Ignatov, D. A. Karpov, D. A. Kornev, T. A. Le, P. Y. Pugin, M. S. Burtsev, “Socialbot dream in alexa prize challenge”, Proceedings of Moscow Institute of Physics and Technology, 13:3 (2019), 62–89
[20] Y. Kuratov, M. Y. Arkhipov, Adaptation of deep bidirectional multilingual transformers for russian language, 2019, arXiv: 1905.07213
[21] Y. Kuratov, I. Yusupov, D. Baymurzina, D. Kuznetsov, D. Cherniavskii, A. Dmitrievskiy, E. Ermakova, F. Ignatov, D. Karpov, D. Kornev, et al., “Dream technical report for the alexa prize”, 3rd Proceedings of Alexa Prize, 2019
[22] J. Lehmann, R. Isele, M. Jakob, A. Jentzsch, D. Kontokostas, P. Mendes, S. Hellmann, M. Morsey, P. Van Kleef, S. Auer, C. Bizer, “Dbpedia – a large-scale, multilingual knowledge base extracted from wikipedia”, Semantic Web Journal, 6 (2014)
[23] Y. Liang, N. Duan, Y. Gong, N. Wu, F. Guo, W. Qi, M. Gong, L. Shou, D. Jiang, G. Cao, X. Fan, R. Zhang, R. Agrawal, E. Cui, S. Wei, T. Bharti, Y. Qiao, J.-H. Chen, W. Wu, S. Liu, F. Yang, D. Campos, R. Majumder, and M. Zhou, XGLUE: A new benchmark dataset for cross-lingual pre-training, understanding and generation, 2020, arXiv: 2004.01401
[24] J. Niklaus, V. Matoshi, P. Rani, A. Galassi, M. Stürmer, I. Chalkidis, Lextreme: A multi-lingual and multi-task benchmark for the legal domain, 2023
[25] A. Perevalov, Pstu dataset: classification of university-related topics, 2018, https://github.com/Perevalov/pstu_assistant/blob/master/data/data.txt
[26] B. Sagyndyk, D. Baymurzina, M. Burtsev, “DeepPavlov topics: Topic classification dataset for conversational domain in English”, Advances in Neural Computation, Machine Learning, and Cognitive Research VI, eds. Boris Kryzhanovsky, Witali Dunin-Barkowski, Vladimir Redko, and Yury Tiumentsev, Springer International Publishing, Cham, 2023, 371–380
[27] SberDevices, rut5, ruroberta, rubert: How we trained a series of models for the russian-language, 2021 (Accessed: 2023-02-17), https://habr.com/ru/company/sberbank/blog/567776/; HuggingFace model link: https://huggingface.co/sberbank-ai/ruBert-base
[28] T. Scialom, P.-A. Dray, S. Lamprier, B. Piwowarski, and J. Staiano, Mlsum: The multilingual summarization corpus, 2020, arXiv: 2004.14900
[29] E. Stamatatos, “On the robustness of authorship attribution based on character n-gram features”, Journal of Law and Policy, 21 (2013), 421–439
[30] M. Suzgun, L. Melas-Kyriazi, S. K. Sarkar, S. D. Kominers, S. M. Shieber, The harvard uspto patent dataset: A large-scale, well-structured, and multi-purpose corpus of patent applications, 2022, arXiv: 2207.04043
[31] T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue, A. Moi, P. Cistac, T. Rault, R. Louf, M. Funtowicz, J. Davison, S. Shleifer, P. von Platen, C. Ma, Y. Jernite, J. Plu, C. Xu, T. Le Scao, S. Gugger, M. Drame, Q. Lhoest, A. M. Rush, “Transformers: State-of-the-art natural language processing”, Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations (Online, October 2020), Association for Computational Linguistics, 38–45
[32] X. Zhang, J. J. Zhao, Y. LeCun, Character-level convolutional networks for text classification, NIPS, 2015
[33] L. Zhou, J. Gao, D. Li, H.-Y. Shum, The design and implementation of xiaoice, an empathetic social chatbot, 2018