@article{ZNSL_2023_529_a7,
author = {V. Moskvoretskii and A. Frolov and D. Kuznetsov},
title = {IMAD: {IMage-Augmented} multi-modal dialogue},
journal = {Zapiski Nauchnykh Seminarov POMI},
pages = {102--122},
year = {2023},
volume = {529},
language = {en},
url = {http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a7/}
}
V. Moskvoretskii; A. Frolov; D. Kuznetsov. IMAD: IMage-Augmented multi-modal dialogue. Zapiski Nauchnykh Seminarov POMI, Investigations on applied mathematics and informatics. Part II–1, Vol. 529 (2023), pp. 102–122. http://geodesic.mathdoc.fr/item/ZNSL_2023_529_a7/
[1] J.-B. Alayrac, J. Donahue, P. Luc, A. Miech, I. Barr, Y. Hasson, K. Lenc, A. Mensch, K. Millican, M. Reynolds, R. Ring, E. Rutherford, S. Cabi, T. Han, Z. Gong, S. Samangooei, M. Monteiro, J. Menick, S. Borgeaud, A. Brock, A. Nematzadeh, S. Sharifzadeh, M. Binkowski, R. Barreira, O. Vinyals, A. Zisserman, K. Simonyan, Flamingo: a visual language model for few-shot learning, 2022
[2] N. Anantrasirichai, D. Bull, “Artificial intelligence in the creative industries: a review”, Artificial Intelligence Review, 55:1 (2021), 589–656
[3] T. Baltrušaitis, C. Ahuja, L.-P. Morency, Multimodal machine learning: A survey and taxonomy, 2017
[4] M. S. Burtsev, A. V. Seliverstov, R. Airapetyan, M. Arkhipov, D. Baymurzina, N. Bushkov, O. Gureenkova, T. Khakhulin, Y. Kuratov, D. Kuznetsov, et al., “DeepPavlov: Open-source library for dialogue systems”, Proceedings of ACL 2018, v. 4, 2018, 122–127
[5] S. Changpinyo, P. Sharma, N. Ding, R. Soricut, Conceptual 12m: Pushing web-scale image-text pre-training to recognize long-tail visual concepts, 2021
[6] H. Chen, X. Liu, D. Yin, J. Tang, “A survey on dialogue systems”, ACM SIGKDD Explorations Newsletter, 19:2 (2017), 25–35
[7] J. Chen, C. Wang, K. Wang, C. Yin, C. Zhao, T. Xu, X. Zhang, Z. Huang, M. Liu, T. Yang, “HEU emotion: a large-scale database for multimodal emotion recognition in the wild”, Neural Computing and Applications, 33:14 (2021), 8669–8685
[8] Y. Chen, X.-H. Yang, Z. Wei, A. A. Heidari, N. Zheng, Z. Li, H. Chen, H. Hu, Q. Zhou, Q. Guan, “Generative adversarial networks in medical image augmentation: a review”, Computers in Biology and Medicine, 2022, 105382
[9] L. Cui, Y. Wu, S. Liu, Y. Zhang, M. Zhou, “MuTual: A dataset for multi-turn dialogue reasoning”, Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (Online, July 2020), Association for Computational Linguistics, 1406–1416
[10] D. Dellermann, A. Calma, N. Lipusch, T. Weber, S. Weigel, P. Ebel, The future of human-ai collaboration: a taxonomy of design knowledge for hybrid intelligence systems, 2021
[11] J. Devlin, M.-W. Chang, K. Lee, K. Toutanova, Bert: Pre-training of deep bidirectional transformers for language understanding, 2018
[12] A. Dosovitskiy, L. Beyer, A. Kolesnikov, D. Weissenborn, X. Zhai, T. Unterthiner, M. Dehghani, M. Minderer, G. Heigold, S. Gelly, J. Uszkoreit, N. Houlsby, An image is worth 16x16 words: Transformers for image recognition at scale, 2020
[13] J. Gao, P. Li, Z. Chen, J. Zhang, “A Survey on Deep Learning for Multimodal Data Fusion”, Neural Computation, 32:5 (2020), 829–864
[14] X. Gu, K. M. Yoo, J.-W. Ha, Dialogbert: Discourse-aware response generation via learning to recover and rank utterances, 2020
[15] D. Hendrycks, K. Gimpel, Gaussian error linear units (gelus), 2016
[16] X. Hu, Z. Gan, J. Wang, Z. Yang, Z. Liu, Y. Lu, L. Wang, Scaling up vision-language pre-training for image captioning, 2021
[17] S. Huang, L. Dong, W. Wang, Y. Hao, S. Singhal, S. Ma, T. Lv, L. Cui, O. K. Mohammed, B. Patra, Q. Liu, K. Aggarwal, Z. Chi, J. Bjorck, V. Chaudhary, S. Som, X. Song, F. Wei, Language is not all you need: Aligning perception with language models, 2023
[18] C. Jia, Y. Yang, Y. Xia, Y.-T. Chen, Z. Parekh, H. Pham, Q. V. Le, Y. Sung, Z. Li, T. Duerig, Scaling up visual and vision-language representation learning with noisy text supervision, 2021
[19] G. Joshi, R. Walambe, K. Kotecha, “A review on explainability in multimodal deep neural nets”, IEEE Access, 9 (2021), 59800–59821
[20] A. B. Kocaballi, Conversational ai-powered design: Chatgpt as designer, user, and product, 2023
[21] R. Krishna, Y. Zhu, O. Groth, J. Johnson, K. Hata, J. Kravitz, S. Chen, Y. Kalantidis, L.-J. Li, D. A. Shamma, M. S. Bernstein, F.-F. Li, Visual genome: Connecting language and vision using crowdsourced dense image annotations, 2016
[22] A. Kumar, The illustrated image captioning using transformers, ankur3107.github.io, 2022
[23] N. Lee, S. Shin, J. Choo, H.-J. Choi, S.-H. Myaeng, Constructing multi-modal dialogue dataset by replacing text with semantically relevant images, 2021
[24] Y.-J. Lee, B. Ko, H.-G. Kim, H.-J. Choi, Dialogcc: Large-scale multi-modal dialogue dataset, 2022
[25] J. Lei, T. L. Berg, M. Bansal, Revealing single frame bias for video-and-language learning, 2022
[26] C. Li, H. Xu, J. Tian, W. Wang, M. Yan, B. Bi, J. Ye, H. Chen, G. Xu, Z. Cao, J. Zhang, S. Huang, F. Huang, J. Zhou, L. Si, mplug: Effective and efficient vision-language learning by cross-modal skip-connections, 2022
[27] D. Li, J. Li, H. Le, G. Wang, S. Savarese, S. C. H. Hoi, Lavis: A library for language-vision intelligence, 2022
[28] J. Li, D. Li, S. Savarese, S. Hoi, Blip-2: Bootstrapping language-image pre-training with frozen image encoders and large language models, 2023
[29] J. Li, D. Li, C. Xiong, S. Hoi, Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation, 2022
[30] J. Li, D. Li, C. Xiong, S. Hoi, “Blip: Bootstrapping language-image pre-training for unified vision-language understanding and generation”, ICML 2022
[31] J. Li, R. R. Selvaraju, A. D. Gotmare, S. Joty, C. Xiong, S. Hoi, Align before fuse: Vision and language representation learning with momentum distillation, 2021
[32] K. Li, Y. Zhang, K. Li, Y. Li, Y. Fu, “Visual semantic reasoning for image-text matching”, ICCV 2019
[33] Y. Li, H. Su, X. Shen, W. Li, Z. Cao, S. Niu, Dailydialog: A manually labelled multi-turn dialogue dataset, 2017
[34] T.-Y. Lin, M. Maire, S. Belongie, L. Bourdev, R. Girshick, J. Hays, P. Perona, D. Ramanan, C. L. Zitnick, P. Dollár, Microsoft coco: Common objects in context, 2014
[35] B. D. Lund, T. Wang, “Chatting about ChatGPT: how may AI and GPT impact academia and libraries?”, Library Hi Tech News, 2023
[36] O. M. Manyar, J. Cheng, R. Levine, V. Krishnan, J. Barbič, S. K. Gupta, “Physics informed synthetic image generation for deep learning-based detection of wrinkles and folds”, J. Computing and Information Science in Engineering, 23:3 (2023), 030903
[37] Q. Motger, X. Franch, J. Marco, “Software-based dialogue systems: Survey, taxonomy, and challenges”, ACM Comput. Surv., 55:5 (2022)
[38] A. van den Oord, Y. Li, O. Vinyals, Representation learning with contrastive predictive coding, 2018
[39] OpenAI, ChatGPT: Optimizing language models for dialogue, 2022
[40] V. Ordonez, G. Kulkarni, T. Berg, “Im2text: Describing images using 1 million captioned photographs”, Advances in Neural Information Processing Systems, 24, eds. J. Shawe-Taylor, R. Zemel, P. Bartlett, F. Pereira, and K. Q. Weinberger, Curran Associates, Inc., 2011
[41] F. Pedregosa, G. Varoquaux, A. Gramfort, V. Michel, B. Thirion, O. Grisel, M. Blondel, P. Prettenhofer, R. Weiss, V. Dubourg, J. Vanderplas, A. Passos, D. Cournapeau, M. Brucher, M. Perrot, and E. Duchesnay, “Scikit-learn: Machine learning in Python”, J. Machine Learning Research, 12 (2011), 2825–2830
[42] M. Post, “A call for clarity in reporting BLEU scores”, Proceedings of the Third Conference on Machine Translation, Research Papers (Brussels, Belgium, October 2018), Association for Computational Linguistics, 186–191
[43] A. Radford, J. W. Kim, C. Hallacy, A. Ramesh, G. Goh, S. Agarwal, G. Sastry, A. Askell, P. Mishkin, J. Clark, G. Krueger, I. Sutskever, Learning transferable visual models from natural language supervision, 2021
[44] A. Ramesh, M. Pavlov, G. Goh, S. Gray, C. Voss, A. Radford, M. Chen, I. Sutskever, Zero-shot text-to-image generation, 2021
[45] H. Rashkin, E. M. Smith, M. Li, Y.-L. Boureau, Towards empathetic open-domain conversation models: a new benchmark and dataset, 2018
[46] S. Reed, K. Zolna, E. Parisotto, S. G. Colmenarejo, A. Novikov, G. Barth-Maron, M. Giménez, Y. Sulsky, J. Kay, J. T. Springenberg, T. Eccles, J. Bruce, A. Razavi, A. Edwards, N. Heess, Y. Chen, R. Hadsell, O. Vinyals, M. Bordbar, N. de Freitas, “A generalist agent”, Transactions on Machine Learning Research, 2022, Featured Certification
[47] N. Reimers, I. Gurevych, “Sentence-bert: Sentence embeddings using siamese bert-networks”, Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing, Association for Computational Linguistics, 2019
[48] A. J. C. Rodriguez, D. C. Castro, S. H. Garcia, “Noun-based attention mechanism for fine-grained named entity recognition”, Expert Systems with Applications, 193 (2022), 116406
[49] S. Roller, E. Dinan, N. Goyal, D. Ju, M. Williamson, Y. Liu, J. Xu, M. Ott, K. Shuster, E. M. Smith, Y.-L. Boureau, J. Weston, Recipes for building an open-domain chatbot, 2020
[50] R. Rombach, A. Blattmann, D. Lorenz, P. Esser, B. Ommer, High-resolution image synthesis with latent diffusion models, 2021
[51] C. Saharia, W. Chan, S. Saxena, L. Li, J. Whang, E. Denton, S. K. S. Ghasemipour, B. K. Ayan, S. S. Mahdavi, R. G. Lopes, T. Salimans, J. Ho, D. J. Fleet, M. Norouzi, Photorealistic text-to-image diffusion models with deep language understanding, 2022
[52] C. Schuhmann, R. Vencu, R. Beaumont, R. Kaczmarczyk, C. Mullis, A. Katta, T. Coombes, J. Jitsev, A. Komatsuzaki, Laion-400m: Open dataset of clip-filtered 400 million image-text pairs, 2021
[53] S. Shkarin, ruTS, a library for statistics extraction from texts in Russian, M., 2023
[54] J. Summaira, X. Li, A. M. Shoib, S. Li, J. Abdul, Recent advances and trends in multimodal deep learning: A review, 2021
[55] K. Sun, D. Yu, J. Chen, D. Yu, Y. Choi, C. Cardie, Dream: A challenge dataset and models for dialogue-based reading comprehension, 2019
[56] X. Tan, J. Chen, H. Liu, J. Cong, C. Zhang, Y. Liu, X. Wang, Y. Leng, Y. Yi, L. He, F. Soong, T. Qin, S. Zhao, T.-Y. Liu, Naturalspeech: End-to-end text to speech synthesis with human-level quality, 2022
[57] R. Thoppilan, D. De Freitas, J. Hall, N. Shazeer, A. Kulshreshtha, H.-T. Cheng, A. Jin, T. Bos, L. Baker, Y. Du, Y. Li, H. Lee, H. S. Zheng, A. Ghafouri, M. Menegali, Y. Huang, M. Krikun, D. Lepikhin, J. Qin, D. Chen, Y. Xu, Z. Chen, A. Roberts, M. Bosma, V. Zhao, Y. Zhou, C.-C. Chang, I. Krivokon, W. Rusch, M. Pickett, P. Srinivasan, L. Man, K. Meier-Hellstern, M. R. Morris, T. Doshi, R. D. Santos, T. Duke, J. Soraker, B. Zevenbergen, V. Prabhakaran, M. Diaz, B. Hutchinson, K. Olson, A. Molina, E. Hoffman-John, J. Lee, L. Aroyo, R. Rajakumar, A. Butryna, M. Lamm, V. Kuzmina, J. Fenton, A. Cohen, R. Bernstein, R. Kurzweil, B. Aguera-Arcas, C. Cui, M. Croak, E. Chi, Q. Le, Lamda: Language models for dialog applications, 2022
[58] R. Valle, K. Shih, R. Prenger, B. Catanzaro, Flowtron: an autoregressive flow-based generative network for text-to-speech synthesis, 2020
[59] P. Wang, A. Yang, R. Men, J. Lin, S. Bai, Z. Li, J. Ma, C. Zhou, J. Zhou, H. Yang, Ofa: Unifying architectures, tasks, and modalities through a simple sequence-to-sequence learning framework, 2022
[60] Z. Wang, J. Yu, A. W. Yu, Z. Dai, Y. Tsvetkov, Y. Cao, Simvlm: Simple visual language model pretraining with weak supervision, 2021
[61] T. Le Scao, A. Fan, C. Akiki, E. Pavlick, S. Ilić, D. Hesslow, R. Castagné, A. S. Luccioni, F. Yvon, et al., “Bloom: A 176b-parameter open-access multilingual language model”, BigScience Workshop, 2022
[62] Y. Xi, Y. Zhang, S. Ding, S. Wan, “Visual question answering model based on visual relationship detection”, Signal Processing: Image Communication, 80 (2020), 115648
[63] X. Xu, C. Wu, S. Rosenman, V. Lal, W. Che, N. Duan, Bridgetower: Building bridges between encoders in vision-language representation learning, 2022
[64] S. Yan, T. Zhu, Z. Wang, Y. Cao, M. Zhang, S. Ghosh, Y. Wu, J. Yu, Videococa: Video-text modeling with zero-shot transfer from contrastive captioners, 2022
[65] A. Yang, A. Miech, J. Sivic, I. Laptev, C. Schmid, Zero-shot video question answering via frozen bidirectional language models, 2022
[66] P. Young, A. Lai, M. Hodosh, J. Hockenmaier, “From image descriptions to visual denotations: New similarity metrics for semantic inference over event descriptions”, Transactions of the Association for Computational Linguistics, 2 (2014), 67–78
[67] S. Zhang, E. Dinan, J. Urbanek, A. Szlam, D. Kiela, J. Weston, Personalizing dialogue agents: I have a dog, do you have pets too?, 2018
[68] Y. Zhang, S. Sun, M. Galley, Y.-C. Chen, C. Brockett, X. Gao, J. Gao, J. Liu, B. Dolan, Dialogpt: Large-scale generative pre-training for conversational response generation, November 2019, arXiv: 1911.00536
[69] Y. Zheng, G. Chen, X. Liu, J. Sun, Mmchat: Multi-modal chat dataset on social media, 2021
[70] P. Zhou, K. Gopalakrishnan, B. Hedayatnia, S. Kim, J. Pujara, X. Ren, Y. Liu, D. Hakkani-Tur, Commonsense-focused dialogues for response generation: An empirical study, 2021