See the article record from the source Library of Science
@article{IJAMCS_2024_34_3_a8,
  author = {Yang, Guan and Ji, Cheng and Liu, Xiaoming and Zhang, Ziming and Wang, Chen},
  title = {DCF-VQA: {Counterfactual} structure based on multi-feature enhancement},
  journal = {International Journal of Applied Mathematics and Computer Science},
  pages = {453--466},
  publisher = {mathdoc},
  volume = {34},
  number = {3},
  year = {2024},
  language = {en},
  url = {http://geodesic.mathdoc.fr/item/IJAMCS_2024_34_3_a8/}
}

TY - JOUR
AU - Yang, Guan
AU - Ji, Cheng
AU - Liu, Xiaoming
AU - Zhang, Ziming
AU - Wang, Chen
TI - DCF-VQA: Counterfactual structure based on multi-feature enhancement
JO - International Journal of Applied Mathematics and Computer Science
PY - 2024
SP - 453
EP - 466
VL - 34
IS - 3
PB - mathdoc
UR - http://geodesic.mathdoc.fr/item/IJAMCS_2024_34_3_a8/
LA - en
ID - IJAMCS_2024_34_3_a8
ER -

%0 Journal Article
%A Yang, Guan
%A Ji, Cheng
%A Liu, Xiaoming
%A Zhang, Ziming
%A Wang, Chen
%T DCF-VQA: Counterfactual structure based on multi-feature enhancement
%J International Journal of Applied Mathematics and Computer Science
%D 2024
%P 453-466
%V 34
%N 3
%I mathdoc
%U http://geodesic.mathdoc.fr/item/IJAMCS_2024_34_3_a8/
%G en
%F IJAMCS_2024_34_3_a8
Yang, Guan; Ji, Cheng; Liu, Xiaoming; Zhang, Ziming; Wang, Chen. DCF-VQA: Counterfactual structure based on multi-feature enhancement. International Journal of Applied Mathematics and Computer Science, Volume 34 (2024) no. 3, pp. 453-466. http://geodesic.mathdoc.fr/item/IJAMCS_2024_34_3_a8/
[1] Abbasnejad, E., Teney, D., Parvaneh, A., Shi, J. and van den Hengel, A. (2020). Counterfactual vision and language learning, Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Seattle, USA, pp. 10044-10054.
[2] Agrawal, A., Batra, D., Parikh, D. and Kembhavi, A. (2018). Don’t just assume; look and answer: Overcoming priors for visual question answering, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 4971-4980.
[3] AlFawwaz, B.M., AL-Shatnawi, A., Al-Saqqar, F. and Nusir, M. (2022). Multi-resolution discrete cosine transform fusion technique face recognition model, Data 7(6): 80.
[4] Anderson, P., He, X., Buehler, C., Teney, D., Johnson, M., Gould, S. and Zhang, L. (2018). Bottom-up and top-down attention for image captioning and visual question answering, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 6077-6086.
[5] Antol, S., Agrawal, A., Lu, J., Mitchell, M., Batra, D., Zitnick, C.L. and Parikh, D. (2015). VQA: Visual question answering, Proceedings of the IEEE International Conference on Computer Vision, Santiago, Chile, pp. 2425-2433.
[6] Cadene, R., Dancette, C., Benyounes, H., Cord, M. and Parikh, D. (2019). RUBi: Reducing unimodal biases for visual question answering, Advances in Neural Information Processing Systems 32: 3197-3208.
[7] Chen, L., Zheng, Y., Niu, Y., Zhang, H. and Xiao, J. (2023). Counterfactual samples synthesizing and training for robust visual question answering, IEEE Transactions on Pattern Analysis and Machine Intelligence 45(11): 13218-13234.
[8] Clark, C., Yatskar, M. and Zettlemoyer, L. (2019). Don’t take the easy way out: Ensemble based methods for avoiding known dataset biases, arXiv: 1909.03683.
[9] Das, A., Agrawal, H., Zitnick, L., Parikh, D. and Batra, D. (2017a). Human attention in visual question answering: Do humans and deep networks look at the same regions?, Computer Vision and Image Understanding 163: 90-100.
[10] Das, A., Kottur, S., Gupta, K., Singh, A., Yadav, D., Moura, J.M., Parikh, D. and Batra, D. (2017b). Visual dialog, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 326-335.
[11] Gat, I., Schwartz, I., Schwing, A. and Hazan, T. (2020). Removing bias in multi-modal classifiers: Regularization by maximizing functional entropies, Advances in Neural Information Processing Systems 33: 3197-3208.
[12] Gokhale, T., Banerjee, P., Baral, C. and Yang, Y. (2020). MUTANT: A training paradigm for out-of-distribution generalization in visual question answering, arXiv: 2009.08566.
[13] Goyal, Y., Khot, T., Summers-Stay, D., Batra, D. and Parikh, D. (2017). Making the V in VQA matter: Elevating the role of image understanding in visual question answering, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Honolulu, USA, pp. 6904-6913.
[14] Guo, Y., Nie, L., Cheng, Z., Ji, F., Zhang, J. and Del Bimbo, A. (2021). AdaVQA: Overcoming language priors with adapted margin cosine loss, arXiv: 2105.01993.
[15] Hashemi, M., Mahmoudi, G., Kodeiri, S., Sheikhi, H. and Eetemadi, S. (2023). LXMERT model compression for visual question answering, arXiv: 2310.15325.
[16] Kafle, K. and Kanan, C. (2017a). An analysis of visual question answering algorithms, Proceedings of the IEEE International Conference on Computer Vision, Venice, Italy, pp. 1965-1973.
[17] Kafle, K. and Kanan, C. (2017b). Visual question answering: Datasets, algorithms, and future challenges, Computer Vision and Image Understanding 163: 3-20.
[18] Kingma, D.P. and Ba, J. (2014). Adam: A method for stochastic optimization, arXiv: 1412.6980.
[19] Kv, G. and Mittal, A. (2020). Reducing language biases in visual question answering with visually-grounded question encoder, Computer Vision - ECCV 2020: 16th European Conference, Glasgow, UK, pp. 18-34.
[20] Li, L., Gan, Z., Cheng, Y. and Liu, J. (2019). Relation-aware graph attention network for visual question answering, Proceedings of the IEEE/CVF International Conference on Computer Vision, Seoul, Korea, pp. 10313-10322.
[21] Liang, Z., Jiang, W., Hu, H. and Zhu, J. (2020). Learning to contrast the counterfactual samples for robust visual question answering, Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), online, pp. 3285-3292.
[22] Metwaly, M.K., Elkalashy, N.I. and Zaky, M.S. (2017). Discrete sine and cosine transforms for signal processing spectral overlap saliencies of induction machine, IEEE Transactions on Industrial Electronics 65(1): 189-199.
[23] Niu, Y., Tang, K., Zhang, H., Lu, Z., Hua, X.-S. and Wen, J.-R. (2021). Counterfactual VQA: A cause-effect look at language bias, Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Nashville, USA, pp. 12700-12710.
[24] Park, D.H., Hendricks, L.A., Akata, Z., Rohrbach, A., Schiele, B., Darrell, T. and Rohrbach, M. (2018). Multimodal explanations: Justifying decisions and pointing to the evidence, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Salt Lake City, USA, pp. 8779-8788.
[25] Ramakrishnan, S., Agrawal, A. and Lee, S. (2018). Overcoming language priors in visual question answering with adversarial regularization, Proceedings of the 32nd International Conference on Neural Information Processing Systems, NIPS’18, Montreal, Canada, pp. 1548-1558.
[26] Selvaraju, R.R., Lee, S., Shen, Y., Jin, H., Ghosh, S., Heck, L., Batra, D. and Parikh, D. (2019). Taking a hint: Leveraging explanations to make vision and language models more grounded, Proceedings of the IEEE/CVF International Conference on Computer Vision, Seoul, Korea, pp. 2591-2600.
[27] Shrestha, R., Kafle, K. and Kanan, C. (2020). A negative case analysis of visual grounding methods for VQA, arXiv: 2004.05704.
[28] Si, Q., Lin, Z., Zheng, M., Fu, P. and Wang, W. (2021). Check it again: Progressive visual question answering via visual entailment, arXiv: 2106.04605.
[29] Surówka, G. and Ogorzałek, M. (2022). Segmentation of the melanoma lesion and its border, International Journal of Applied Mathematics and Computer Science 32(4): 683-699, DOI: 10.34768/amcs-2022-0047.
[30] Teney, D., Abbasnedjad, E. and van den Hengel, A. (2020). Learning what makes a difference from counterfactual examples and gradient supervision, Computer Vision - ECCV 2020: 16th European Conference, Glasgow, UK, pp. 580-599.
[31] Teney, D. and van den Hengel, A. (2019). Actively seeking and learning from live data, Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA.
[32] Wu, J. and Mooney, R. (2019). Self-critical reasoning for robust visual question answering, 33rd Conference on Neural Information Processing Systems (NeurIPS 2019), Vancouver, Canada, pp. 8604-8610.
[33] Yang, C., Feng, S., Li, D., Shen, H., Wang, G. and Jiang, B. (2021). Learning content and context with language bias for visual question answering, 2021 IEEE International Conference on Multimedia and Expo (ICME), Shenzhen, China, pp. 1-6.
[34] Yang, L., Xie, T., Liu, M., Zhang, M., Qi, S. and Yang, J. (2023a). Infrared small-target detection under a complex background based on a local gradient contrast method, International Journal of Applied Mathematics and Computer Science 33(1): 33-43, DOI: 10.34768/amcs-2023-0003.
[35] Yang, P., Wang, Q., Chen, H. and Wu, Z. (2023b). Position-aware spatio-temporal graph convolutional networks for skeleton-based action recognition, IET Computer Vision 17(7): 844-854.
[36] Yang, Z., He, X., Gao, J., Deng, L. and Smola, A. (2016). Stacked attention networks for image question answering, Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition, Las Vegas, USA, pp. 21-29.
[37] Zellers, R., Bisk, Y., Farhadi, A. and Choi, Y. (2019). From recognition to cognition: Visual commonsense reasoning, Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition, Long Beach, USA, pp. 6720-6731.
[38] Zhang, L., Liu, S., Liu, D., Zeng, P., Li, X., Song, J. and Gao, L. (2020). Rich visual knowledge-based augmentation network for visual question answering, IEEE Transactions on Neural Networks and Learning Systems 32(10): 4362-4373.
[39] Zhu, X., Mao, Z., Liu, C., Zhang, P., Wang, B. and Zhang, Y. (2020). Overcoming language priors with self-supervised learning for visual question answering, arXiv: 2012.11528.