@article{RM_2024_79_6_a5,
author = {S. A. Chezhegov and S. N. Skorik and N. Khachaturov and D. S. Shalagin and A. A. Avetisyan and M. Tak\'a\v{c} and Y. A. Kholodov and A. N. Beznosikov},
title = {Local methods with adaptivity via scaling},
journal = {Trudy Matematicheskogo Instituta imeni V.A. Steklova},
pages = {1051--1091},
year = {2024},
volume = {79},
number = {6},
language = {en},
url = {http://geodesic.mathdoc.fr/item/RM_2024_79_6_a5/}
}
TY - JOUR
AU - S. A. Chezhegov
AU - S. N. Skorik
AU - N. Khachaturov
AU - D. S. Shalagin
AU - A. A. Avetisyan
AU - M. Takáč
AU - Y. A. Kholodov
AU - A. N. Beznosikov
TI - Local methods with adaptivity via scaling
JO - Trudy Matematicheskogo Instituta imeni V.A. Steklova
PY - 2024
SP - 1051
EP - 1091
VL - 79
IS - 6
UR - http://geodesic.mathdoc.fr/item/RM_2024_79_6_a5/
LA - en
ID - RM_2024_79_6_a5
ER -
%0 Journal Article
%A S. A. Chezhegov
%A S. N. Skorik
%A N. Khachaturov
%A D. S. Shalagin
%A A. A. Avetisyan
%A M. Takáč
%A Y. A. Kholodov
%A A. N. Beznosikov
%T Local methods with adaptivity via scaling
%J Trudy Matematicheskogo Instituta imeni V.A. Steklova
%D 2024
%P 1051-1091
%V 79
%N 6
%U http://geodesic.mathdoc.fr/item/RM_2024_79_6_a5/
%G en
%F RM_2024_79_6_a5
S. A. Chezhegov; S. N. Skorik; N. Khachaturov; D. S. Shalagin; A. A. Avetisyan; M. Takáč; Y. A. Kholodov; A. N. Beznosikov. Local methods with adaptivity via scaling. Trudy Matematicheskogo Instituta imeni V.A. Steklova, Volume 79 (2024) no. 6, pp. 1051-1091. http://geodesic.mathdoc.fr/item/RM_2024_79_6_a5/
[1] S. Shalev-Shwartz and S. Ben-David, Understanding machine learning. From theory to algorithms, Cambridge Univ. Press, Cambridge, 2014, xvi+397 pp. | DOI | Zbl
[2] I. Goodfellow, Y. Bengio, and A. Courville, Deep learning, Adapt. Comput. Mach. Learn., MIT Press, Cambridge, MA, 2016, xxii+775 pp. | MR | Zbl
[3] S. Arora, N. Cohen, and E. Hazan, “On the optimization of deep networks: implicit acceleration by overparameterization”, Proceedings of the 35th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 80, 2018, 244–253 https://proceedings.mlr.press/v80/arora18a.html
[4] V. Smith, S. Forte, Chenxin Ma, M. Takáč, M. I. Jordan, and M. Jaggi, “CoCoA: A general framework for communication-efficient distributed optimization”, J. Mach. Learn. Res., 18 (2018), 230, 49 pp. | MR | Zbl
[5] K. Mishchenko, E. Gorbunov, M. Takáč, and P. Richtárik, Distributed learning with compressed gradient differences, 2023 (v1 – 2019), 59 pp., arXiv: 1901.09269
[6] J. Verbraeken, M. Wolting, J. Katzy, J. Kloppenburg, T. Verbelen, and J. S. Rellermeyer, “A survey on distributed machine learning”, ACM Comput. Surveys, 53:2 (2020), 30, 33 pp. | DOI
[7] S. Chraibi, A. Khaled, D. Kovalev, P. Richtárik, A. Salim, and M. Takáč, Distributed fixed point methods with compressed iterates, 2019, 15 pp., arXiv: 1912.09925
[8] V. Pirau, A. Beznosikov, M. Takáč, V. Matyukhin, and A. Gasnikov, “Preconditioning meets biased compression for efficient distributed optimization”, Comput. Manag. Sci., 21:1 (2024), 14, 22 pp. | DOI | MR | Zbl
[9] J. Konečný, H. B. McMahan, F. X. Yu, P. Richtárik, A. T. Suresh, and D. Bacon, Federated learning: strategies for improving communication efficiency, 2017 (v1 – 2016), 10 pp., arXiv: 1610.05492
[10] P. Kairouz, H. B. McMahan, B. Avent, A. Bellet, M. Bennis, A. N. Bhagoji, K. Bonawitz, Z. Charles, G. Cormode, R. Cummings, et al., “Advances and open problems in federated learning”, Found. Trends Mach. Learn., 14:1-2 (2021), 1–210 | DOI | Zbl
[11] S. P. Karimireddy, S. Kale, M. Mohri, S. Reddi, S. Stich, and A. T. Suresh, “SCAFFOLD: Stochastic controlled averaging for federated learning”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 5132–5143 https://proceedings.mlr.press/v119/karimireddy20a.html
[12] Y. Arjevani and O. Shamir, “Communication complexity of distributed convex learning and optimization”, NIPS'15: Proceedings of the 28th international conference on neural information processing systems, v. 1, Adv. Neural Inf. Process. Syst., 28, MIT Press, Cambridge, MA, 2015, 1756–1764 https://papers.nips.cc/paper_files/paper/2015/hash/7fec306d1e665bc9c748b5d2b99a6e97-Abstract.html
[13] D. Alistarh, D. Grubic, J. Z. Li, R. Tomioka, and M. Vojnovic, “QSGD: Communication-efficient SGD via gradient quantization and encoding”, NIPS'17: Proceedings of the 31st international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 30, Curran Associates, Inc., Red Hook, NY, 2017, 1707–1718 https://proceedings.neurips.cc/paper_files/paper/2017/hash/6c340f25839e6acdc73414517203f5f0-Abstract.html
[14] A. Beznosikov, S. Horváth, P. Richtárik, and M. Safaryan, On biased compression for distributed learning, 2024 (v1 – 2020), 50 pp., arXiv: 2002.12410
[15] O. L. Mangasarian, “Parallel gradient distribution in unconstrained optimization”, SIAM J. Control Optim., 33:6 (1995), 1916–1925 | DOI | MR | Zbl
[16] S. U. Stich, Local SGD converges fast and communicates little, 2019 (v1 – 2018), 19 pp., arXiv: 1805.09767
[17] B. McMahan, E. Moore, D. Ramage, S. Hampson, and B. Agüera y Arcas, “Communication-efficient learning of deep networks from decentralized data”, Proceedings of the 20th international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 54, 2017, 1273–1282 https://proceedings.mlr.press/v54/mcmahan17a
[18] Jianyu Wang, V. Tantia, N. Ballas, and M. Rabbat, SlowMo: Improving communication-efficient distributed SGD with slow momentum, 2020 (v1 – 2019), 27 pp., arXiv: 1910.00643
[19] D. Basu, D. Data, C. Karakus, and S. N. Diggavi, “Qsparse-local-SGD: distributed SGD with quantization, sparsification, and local computations”, IEEE J. Sel. Areas Inform. Theory, 1:1 (2020), 217–226 | DOI
[20] A. Reisizadeh, A. Mokhtari, H. Hassani, A. Jadbabaie, and R. Pedarsani, “FedPAQ: A communication-efficient federated learning method with periodic averaging and quantization”, Proceedings of the 23rd international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 108, 2020, 2021–2031 https://proceedings.mlr.press/v108/reisizadeh20a.html
[21] Xianfeng Liang, Shuheng Shen, Jingchang Liu, Zhen Pan, Enhong Chen, and Yifei Cheng, Variance reduced local SGD with lower communication complexity, 2019, 25 pp., arXiv: 1912.12844
[22] P. Sharma, S. Kafle, P. Khanduri, S. Bulusu, K. Rajawat, and P. K. Varshney, Parallel restarted SPIDER–communication efficient distributed nonconvex optimization with optimal computation complexity, 2020 (v1 – 2019), 25 pp., arXiv: 1912.06036
[23] K. Mishchenko, G. Malinovsky, S. Stich, and P. Richtárik, “ProxSkip: Yes! Local gradient steps provably lead to communication acceleration! Finally!”, Proceedings of the 39th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 162, 2022, 15750–15769 https://proceedings.mlr.press/v162/mishchenko22b
[24] O. Shamir, N. Srebro, and Tong Zhang, “Communication-efficient distributed optimization using an approximate Newton-type method”, Proceedings of the 31st international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 32, 2014, 1000–1008 https://proceedings.mlr.press/v32/shamir14.html
[25] H. Hendrikx, Lin Xiao, S. Bubeck, F. Bach, and L. Massoulie, “Statistically preconditioned accelerated gradient method for distributed optimization”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 4203–4227 https://proceedings.mlr.press/v119/hendrikx20a.html
[26] A. Beznosikov, G. Scutari, A. Rogozin, and A. Gasnikov, “Distributed saddle-point problems under similarity”, NIPS'21: Proceedings of the 35th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 34, Curran Associates, Inc., Red Hook, NY, 2021, 625, 8172–8184 https://proceedings.neurips.cc/paper_files/paper/2021/hash/44e65d3e9bc2f88b2b3d566de51a5381-Abstract.html
[27] D. Kovalev, A. Beznosikov, E. Borodich, A. Gasnikov, and G. Scutari, Optimal gradient sliding and its application to distributed optimization under similarity, 2022, 24 pp., arXiv: 2205.15136
[28] D. P. Kingma and J. L. Ba, Adam: a method for stochastic optimization, 2017 (v1 – 2014), 15 pp., arXiv: 1412.6980
[29] T. Tieleman and G. Hinton, “Lecture 6.5 - RMSProp: divide the gradient by a running average of its recent magnitude”, COURSERA: Neural networks for machine learning, 4:2 (2012), 26–31
[30] J. Duchi, E. Hazan, and Y. Singer, “Adaptive subgradient methods for online learning and stochastic optimization”, J. Mach. Learn. Res., 12 (2011), 2121–2159 | MR | Zbl
[31] C. Bekas, E. Kokiopoulou, and Y. Saad, “An estimator for the diagonal of a matrix”, Appl. Numer. Math., 57:11-12 (2007), 1214–1229 | DOI | MR | Zbl
[32] A. Sadiev, A. Beznosikov, A. J. Almansoori, D. Kamzolov, R. Tappenden, and M. Takáč, “Stochastic gradient methods with preconditioned updates”, J. Optim. Theory Appl., 201:2 (2024), 471–489 | DOI | MR | Zbl
[33] M. Jahani, S. Rusakov, Zheng Shi, P. Richtárik, M. W. Mahoney, and M. Takáč, Doubly adaptive scaled algorithm for machine learning using second-order information, 2021, 44 pp., arXiv: 2109.05198
[34] J. E. Dennis, Jr., and J. J. Moré, “Quasi-Newton methods, motivation and theory”, SIAM Rev., 19:1 (1977), 46–89 | DOI | MR | Zbl
[35] R. Fletcher, Practical methods of optimization, 2nd ed., Wiley-Intersci. [John Wiley & Sons], New York, 2013, xiv+436 pp. | MR | Zbl
[36] A. Khaled, K. Mishchenko, and P. Richtárik, “Tighter theory for local SGD on identical and heterogeneous data”, Proceedings of the 23rd international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 108, 2020, 4519–4529 https://proceedings.mlr.press/v108/bayoumi20a.html
[37] A. Koloskova, N. Loizou, S. Boreiri, M. Jaggi, and S. Stich, “A unified theory of decentralized SGD with changing topology and local updates”, Proceedings of the 37th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 119, 2020, 5381–5393 https://proceedings.mlr.press/v119/koloskova20a.html
[38] A. Beznosikov, P. Dvurechenskii, A. Koloskova, V. Samokhin, S. U. Stich, and A. Gasnikov, “Decentralized local stochastic extra-gradient for variational inequalities”, NIPS'22: Proceedings of the 36th international conference on neural information processing systems, Adv. Neural Inf. Process. Syst., 35, Curran Associates, Inc., Red Hook, NY, 2022, 2762, 38116–38133 https://proceedings.neurips.cc/paper_files/paper/2022/hash/f9379afacdbabfdc6b060972b60f9ab8-Abstract-Conference.html
[39] M. R. Glasgow, Honglin Yuan, and Tengyu Ma, “Sharp bounds for federated averaging (local SGD) and continuous perspective”, Proceedings of the 25th international conference on artificial intelligence and statistics, Proc. Mach. Learn. Res. (PMLR), 151, 2022, 9050–9090 https://proceedings.mlr.press/v151/glasgow22a
[40] A. Sadiev, A. Beznosikov, A. J. Almansoori, D. Kamzolov, R. Tappenden, and M. Takáč, Stochastic gradient methods with preconditioned updates, 2024 (v1 – 2022), 40 pp., arXiv: 2206.00285
[41] A. Beznosikov, A. Alanov, D. Kovalev, M. Takáč, and A. Gasnikov, On scaled methods for saddle point problems, 2023 (v1 – 2022), 54 pp., arXiv: 2206.08303
[42] S. Reddi, Z. Charles, M. Zaheer, Z. Garrett, K. Rush, J. Konečný, S. Kumar, and H. B. McMahan, Adaptive federated optimization, 2021 (v1 – 2020), 38 pp., arXiv: 2003.00295
[43] Y. Nesterov, Introductory lectures on convex optimization. A basic course, Appl. Optim., 87, Kluwer Acad. Publ., Boston, MA, 2004, xviii+236 pp. | DOI | MR | Zbl
[44] A. S. Nemirovsky and D. B. Yudin, Problem complexity and method efficiency in optimization, Wiley-Intersci. Ser. Discrete Math., Wiley-Intersci. Publ., John Wiley & Sons, Inc., New York, 1983, xv+388 pp. | MR | Zbl
[45] Lam Nguyen, Phuong Ha Nguyen, M. van Dijk, P. Richtárik, K. Scheinberg, and M. Takáč, “SGD and Hogwild! Convergence without the bounded gradients assumption”, Proceedings of the 35th international conference on machine learning, Proc. Mach. Learn. Res. (PMLR), 80, 2018, 3750–3758 https://proceedings.mlr.press/v80/nguyen18c
[46] Zhewei Yao, A. Gholami, Sheng Shen, M. Mustafa, K. Keutzer, and M. Mahoney, “ADAHESSIAN: An adaptive second order optimizer for machine learning”, Proc. AAAI Conf. Artif. Intell., 35:12 (2021), 10665–10673 | DOI
[47] A. Défossez, L. Bottou, F. Bach, and N. Usunier, A simple convergence proof of Adam and Adagrad, 2022 (v1 – 2020), 30 pp., arXiv: 2003.02395
[48] A. Krizhevsky, Learning multiple layers of features from tiny images, Tech. rep., Univ. of Toronto, Toronto, ON, 2009, 60 pp. https://www.cs.toronto.edu/~kriz/learning-features-2009-TR.pdf
[49] Kaiming He, Xiangyu Zhang, Shaoqing Ren, and Jian Sun, “Deep residual learning for image recognition”, Proceedings of the 2016 IEEE conference on computer vision and pattern recognition (Las Vegas, NV), IEEE, 2016, 770–778 | DOI