@article{ZNSL_2024_540_a12,
author = {D. Shaikhelislamov and M. Drobyshevskiy and A. Belevancev},
title = {Ensuring trustworthy code: leveraging a static analyzer to identify and mitigate defects in generated code},
journal = {Zapiski Nauchnykh Seminarov POMI},
pages = {233--251},
year = {2024},
volume = {540},
language = {en},
url = {http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a12/}
}
TY - JOUR
AU - D. Shaikhelislamov
AU - M. Drobyshevskiy
AU - A. Belevancev
TI - Ensuring trustworthy code: leveraging a static analyzer to identify and mitigate defects in generated code
JO - Zapiski Nauchnykh Seminarov POMI
PY - 2024
SP - 233
EP - 251
VL - 540
UR - http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a12/
LA - en
ID - ZNSL_2024_540_a12
ER -
%0 Journal Article
%A D. Shaikhelislamov
%A M. Drobyshevskiy
%A A. Belevancev
%T Ensuring trustworthy code: leveraging a static analyzer to identify and mitigate defects in generated code
%J Zapiski Nauchnykh Seminarov POMI
%D 2024
%P 233-251
%V 540
%U http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a12/
%G en
%F ZNSL_2024_540_a12
D. Shaikhelislamov; M. Drobyshevskiy; A. Belevancev. Ensuring trustworthy code: leveraging a static analyzer to identify and mitigate defects in generated code. Zapiski Nauchnykh Seminarov POMI, Investigations on applied mathematics and informatics. Part IV, Volume 540 (2024), pp. 233-251. http://geodesic.mathdoc.fr/item/ZNSL_2024_540_a12/