@article{RM_2024_79_6_a4,
author = {A. E. Sadchikov and S. A. Chezhegov and A. N. Beznosikov and A. V. Gasnikov},
title = {Local {SGD} for near-quadratic problems: {Improving} convergence under unconstrained noise conditions},
journal = {Trudy Matematicheskogo Instituta imeni V.A. Steklova},
pages = {1017--1049},
year = {2024},
volume = {79},
number = {6},
language = {en},
url = {http://geodesic.mathdoc.fr/item/RM_2024_79_6_a4/}
}
TY  - JOUR
AU  - A. E. Sadchikov
AU  - S. A. Chezhegov
AU  - A. N. Beznosikov
AU  - A. V. Gasnikov
TI  - Local SGD for near-quadratic problems: Improving convergence under unconstrained noise conditions
JO  - Trudy Matematicheskogo Instituta imeni V.A. Steklova
PY  - 2024
SP  - 1017
EP  - 1049
VL  - 79
IS  - 6
UR  - http://geodesic.mathdoc.fr/item/RM_2024_79_6_a4/
LA  - en
ID  - RM_2024_79_6_a4
ER  -
%0 Journal Article
%A A. E. Sadchikov
%A S. A. Chezhegov
%A A. N. Beznosikov
%A A. V. Gasnikov
%T Local SGD for near-quadratic problems: Improving convergence under unconstrained noise conditions
%J Trudy Matematicheskogo Instituta imeni V.A. Steklova
%D 2024
%P 1017-1049
%V 79
%N 6
%U http://geodesic.mathdoc.fr/item/RM_2024_79_6_a4/
%G en
%F RM_2024_79_6_a4
A. E. Sadchikov; S. A. Chezhegov; A. N. Beznosikov; A. V. Gasnikov. Local SGD for near-quadratic problems: Improving convergence under unconstrained noise conditions. Trudy Matematicheskogo Instituta imeni V.A. Steklova, Volume 79 (2024) no. 6, pp. 1017-1049. http://geodesic.mathdoc.fr/item/RM_2024_79_6_a4/
[1] M. Abadi, A. Chu, I. Goodfellow, H. B. McMahan, I. Mironov, K. Talwar, and Li Zhang, “Deep learning with differential privacy”, Proceedings of the 2016 ACM SIGSAC conference on computer and communications security, ACM, New York, 2016, 308–318 | DOI
[2] D. Basu, D. Data, C. Karakus, and S. N. Diggavi, “Qsparse-local-SGD: distributed SGD with quantization, sparsification and local computations”, NIPS'19: Proceedings of the 33rd international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 32, Curran Associates, Inc., Red Hook, NY, 2019, 1316, 14695–14706 https://proceedings.neurips.cc/paper_files/paper/2019/hash/d202ed5bcfa858c15a9f383c3e386ab2-Abstract.html
[3] A. Beznosikov, P. Dvurechenskii, A. Koloskova, V. Samokhin, S. U. Stich, and A. Gasnikov, “Decentralized local stochastic extra-gradient for variational inequalities”, NIPS'22: Proceedings of the 36th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 35, Curran Associates, Inc., Red Hook, NY, 2022, 2762, 38116–38133 https://proceedings.neurips.cc/paper_files/paper/2022/hash/f9379afacdbabfdc6b060972b60f9ab8-Abstract-Conference.html
[4] A. Beznosikov, V. Samokhin, and A. Gasnikov, Distributed saddle-point problems: lower bounds, near-optimal and robust algorithms, 2022 (v1 – 2020), 52 pp., arXiv: 2010.13112
[5] A. Beznosikov, G. Scutari, A. Rogozin, and A. Gasnikov, “Distributed saddle-point problems under similarity”, NIPS'21: Proceedings of the 35th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 34, Curran Associates, Inc., Red Hook, NY, 2021, 625, 8172–8184 https://proceedings.neurips.cc/paper/2021/hash/44e65d3e9bc2f88b2b3d566de51a5381-Abstract.html
[6] A. Beznosikov, M. Takáč, and A. Gasnikov, “Similarity, compression and local steps: three pillars of efficient communications for distributed variational inequalities”, NIPS'23: Proceedings of the 37th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 36, Curran Associates, Inc., Red Hook, NY, 2023, 1246, 28663–28677 https://proceedings.neurips.cc/paper_files/paper/2023/hash/5b4a459db23e6db9be2a128380953d96-Abstract-Conference.html
[7] S. Chezhegov, S. Skorik, N. Khachaturov, D. Shalagin, A. Avetisyan, A. Beznosikov, M. Takáč, and Y. Kholodov, Local methods with adaptivity via scaling, 2024, 41 pp., arXiv: 2406.00846
[8] S. Ghadimi and Guanghui Lan, “Optimal stochastic approximation algorithms for strongly convex stochastic composite optimization. II. Shrinking procedures and optimal algorithms”, SIAM J. Optim., 23:4 (2013), 2061–2089 | DOI | MR | Zbl
[9] M. R. Glasgow, Honglin Yuan, and Tengyu Ma, “Sharp bounds for federated averaging (local SGD) and continuous perspective”, Proceedings of the 25th international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 151, 2022, 9050–9090 https://proceedings.mlr.press/v151/glasgow22a
[10] I. Goodfellow, Y. Bengio, and A. Courville, Deep learning, Adapt. Comput. Mach. Learn., MIT Press, Cambridge, MA, 2016, xxii+775 pp. | MR | Zbl
[11] E. Gorbunov, F. Hanzely, and P. Richtárik, “Local SGD: unified theory and new efficient methods”, Proceedings of the 24th international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 130, 2021, 3556–3564 https://proceedings.mlr.press/v130/gorbunov21a.html
[12] H. Hendrikx, Lin Xiao, S. Bubeck, F. Bach, and L. Massoulie, “Statistically preconditioned accelerated gradient method for distributed optimization”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 4203–4227 https://proceedings.mlr.press/v119/hendrikx20a.html
[13] P. Kairouz, H. B. McMahan, B. Avent, A. Bellet, M. Bennis, A. N. Bhagoji, K. Bonawitz, Z. Charles, G. Cormode, R. Cummings, et al., “Advances and open problems in federated learning”, Found. Trends Mach. Learn., 14:1-2 (2021), 1–210 | DOI | Zbl
[14] S. P. Karimireddy, S. Kale, M. Mohri, S. Reddi, S. Stich, and A. T. Suresh, “Scaffold: Stochastic controlled averaging for federated learning”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 5132–5143 https://proceedings.mlr.press/v119/karimireddy20a.html
[15] A. Khaled, K. Mishchenko, and P. Richtárik, “Tighter theory for local SGD on identical and heterogeneous data”, Proceedings of the 23rd international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 108, 2020, 4519–4529 https://proceedings.mlr.press/v108/bayoumi20a.html
[16] A. Koloskova, N. Loizou, S. Boreiri, M. Jaggi, and S. Stich, “A unified theory of decentralized SGD with changing topology and local updates”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 5381–5393 https://proceedings.mlr.press/v119/koloskova20a.html
[17] J. Konečný, H. B. McMahan, F. X. Yu, P. Richtárik, A. T. Suresh, and D. Bacon, Federated learning: strategies for improving communication efficiency, 2017 (v1 – 2016), 10 pp., arXiv: 1610.05492
[18] D. Kovalev, A. Beznosikov, E. Borodich, A. Gasnikov, and G. Scutari, “Optimal gradient sliding and its application to optimal distributed optimization under similarity”, NIPS'22: Proceedings of the 36th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 35, Curran Associates, Inc., Red Hook, NY, 2022, 2427, 33494–33507 https://proceedings.neurips.cc/paper_files/paper/2022/hash/d88f6f81e1aaf606776ffdd06fdf24ef-Abstract-Conference.html
[19] Tian Li, A. K. Sahu, M. Zaheer, M. Sanjabi, A. Talwalkar, and V. Smith, “Federated optimization in heterogeneous networks”, Proc. Mach. Learn. Syst., 2 (2020), 429–450
[20] Xianfeng Liang, Shuheng Shen, Jingchang Liu, Zhen Pan, Enhong Chen, and Yifei Cheng, Variance reduced local SGD with lower communication complexity, 2019, 25 pp., arXiv: 1912.12844
[21] O. L. Mangasarian, “Parallel gradient distribution in unconstrained optimization”, SIAM J. Control Optim., 33:6 (1995), 1916–1925 | DOI | MR | Zbl
[22] B. McMahan, E. Moore, D. Ramage, S. Hampson, and B. A. Arcas, “Communication-efficient learning of deep networks from decentralized data”, Proceedings of the 20th international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 54, 2017, 1273–1282 https://proceedings.mlr.press/v54/mcmahan17a
[23] K. Mishchenko, G. Malinovsky, S. Stich, and P. Richtárik, “ProxSkip: Yes! Local gradient steps provably lead to communication acceleration! Finally!”, Proceedings of the 39th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 162, 2022, 15750–15769 https://proceedings.mlr.press/v162/mishchenko22b
[24] A. Reisizadeh, A. Mokhtari, H. Hassani, A. Jadbabaie, and R. Pedarsani, “FedPAQ: A communication-efficient federated learning method with periodic averaging and quantization”, Proceedings of the 23rd international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 108, 2020, 2021–2031 https://proceedings.mlr.press/v108/reisizadeh20a.html
[25] H. Robbins and S. Monro, “A stochastic approximation method”, Ann. Math. Statistics, 22 (1951), 400–407 | DOI | MR | Zbl
[26] M. Schmidt and N. Le Roux, Fast convergence of stochastic gradient descent under a strong growth condition, 2013, 5 pp., arXiv: 1308.6370
[27] S. Shalev-Shwartz and S. Ben-David, Understanding machine learning. From theory to algorithms, Cambridge Univ. Press, Cambridge, 2014, xvi+397 pp. | DOI | Zbl
[28] O. Shamir, N. Srebro, and Tong Zhang, “Communication-efficient distributed optimization using an approximate Newton-type method”, Proceedings of the 31st international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 32, 2014, 1000–1008 https://proceedings.mlr.press/v32/shamir14.html
[29] P. Sharma, S. Kafle, P. Khanduri, S. Bulusu, K. Rajawat, and P. K. Varshney, Parallel restarted SPIDER – communication efficient distributed nonconvex optimization with optimal computation complexity, 2020 (v1 – 2019), 25 pp., arXiv: 1912.06036
[30] A. Spiridonoff, A. Olshevsky, and Y. Paschalidis, “Communication-efficient SGD: from local SGD to one-shot averaging”, NIPS'21: Proceedings of the 35th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 34, Curran Associates, Inc., Red Hook, NY, 2021, 1861, 24313–24326 https://proceedings.neurips.cc/paper_files/paper/2021/hash/cc06a6150b92e17dd3076a0f0f9d2af4-Abstract.html
[31] S. U. Stich, Local SGD converges fast and communicates little, 2019 (v1 – 2018), 19 pp., arXiv: 1805.09767
[32] J. Verbraeken, M. Wolting, J. Katzy, J. Kloppenburg, T. Verbelen, and J. S. Rellermeyer, “A survey on distributed machine learning”, ACM Comput. Surveys, 53:2 (2020), 30, 1–33 | DOI
[33] Jianyu Wang, V. Tantia, N. Ballas, and M. Rabbat, SlowMo: Improving communication-efficient distributed SGD with slow momentum, 2020 (v1 – 2019), 27 pp., arXiv: 1910.00643
[34] B. Woodworth, K. K. Patel, S. Stich, Zhen Dai, B. Bullins, B. McMahan, O. Shamir, and N. Srebro, “Is local SGD better than minibatch SGD?”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 10334–10343 https://proceedings.mlr.press/v119/woodworth20a
[35] Honglin Yuan and Tengyu Ma, “Federated accelerated stochastic gradient descent”, NIPS'20: Proceedings of the 34th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 33, Curran Associates, Inc., Red Hook, NY, 2020, 448, 5332–5344 https://proceedings.neurips.cc/paper_files/paper/2020/hash/39d0a8908fbe6c18039ea8227f827023-Abstract.html
[36] Jian Zhang, C. De Sa, I. Mitliagkas, and C. Ré, Parallel SGD: when does averaging help?, 2016, 13 pp., arXiv: 1606.07365