@comment{
  Journal article record, key VYURV_2019_8_2_a4.
  Authors are given in the unambiguous "Last, Initials" form.
  The journal title is romanised: each accent command is wrapped in its own
  brace group so BibTeX treats it as a single letter when sorting and recasing,
  and the soft-sign prime is written as {$'$} so the entry stays pure ASCII.
}
@article{VYURV_2019_8_2_a4,
  author   = {Bondarenko, A. A. and Lyakhov, P. A. and Yakobovskiy, M. V.},
  title    = {Coordinated checkpointing with sender-based logging and asynchronous recovery from failure},
  journal  = {Vestnik {\^U}{\v{z}}no-Ural{$'$}skogo gosudarstvennogo universiteta. Seri{\^a} Vy{\v{c}}islitel{$'$}na{\^a} matematika i informatika},
  pages    = {76--91},
  year     = {2019},
  volume   = {8},
  number   = {2},
  language = {ru},
  url      = {http://geodesic.mathdoc.fr/item/VYURV_2019_8_2_a4/},
}
TY  - JOUR
AU  - A. A. Bondarenko
AU  - P. A. Lyakhov
AU  - M. V. Yakobovskiy
TI  - Coordinated checkpointing with sender-based logging and asynchronous recovery from failure
JO  - Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika
PY  - 2019
SP  - 76
EP  - 91
VL  - 8
IS  - 2
UR  - http://geodesic.mathdoc.fr/item/VYURV_2019_8_2_a4/
LA  - ru
ID  - VYURV_2019_8_2_a4
ER  - 
%0 Journal Article
%A A. A. Bondarenko
%A P. A. Lyakhov
%A M. V. Yakobovskiy
%T Coordinated checkpointing with sender-based logging and asynchronous recovery from failure
%J Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika
%D 2019
%P 76-91
%V 8
%N 2
%U http://geodesic.mathdoc.fr/item/VYURV_2019_8_2_a4/
%G ru
%F VYURV_2019_8_2_a4
A. A. Bondarenko; P. A. Lyakhov; M. V. Yakobovskiy. Coordinated checkpointing with sender-based logging and asynchronous recovery from failure. Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika, Tome 8 (2019) no. 2, pp. 76-91. http://geodesic.mathdoc.fr/item/VYURV_2019_8_2_a4/
[1] B. Schroeder, G. A. Gibson, “Understanding Failures in Petascale Computers”, Journal of Physics: Conference Series, 78:1 (2007), 12–22 | DOI
[2] C.-H. Hsu, W.-C. Feng, “A Power-aware Run-time System for High-performance Computing”, Proceedings of the 2005 ACM/IEEE Conference on Supercomputing (Seattle, WA, USA, November 12 – 18, 2005), IEEE, 2005, 1–9 | DOI
[3] Martino C.D., Kalbarczyk Z., Iyer R.K., Baccanico F., Fullop J., Kramer W., “Lessons Learned from the Analysis of System Failures at Petascale: The Case of Blue Waters”, 44th Annual IEEE/IFIP International Conference on Dependable Systems and Networks (Atlanta, Georgia, USA, June 23 – 26, 2014), IEEE, 2014, 610–621 | DOI
[4] Dongarra J., Herault T., Robert Y., Fault-tolerance Techniques for High-performance Computing, Springer, Cham, 2015, 320 pp. | DOI
[5] Berkeley Lab Checkpoint/Restart (BLCR) for LINUX, http://crd.lbl.gov/.../BLCR/
[6] Cappello F., Geist A., Gropp W., Kale S., Kramer B., Snir M., “Toward Exascale Resilience: 2014 Update”, Supercomputing Frontiers and Innovations, 1:1 (2014), 5–28 | DOI
[7] E. N. M. Elnozahy, L. Alvisi, Y.-M. Wang, D. B. Johnson, “A Survey of Rollback-recovery Protocols in Message-passing Systems”, ACM Comput. Surv., 34:3 (2002), 375–408 | DOI
[8] A. Bouteiller, T. Herault, G. Bosilca, P. Du, J. Dongarra, “Algorithm-based Fault Tolerance for Dense Matrix Factorizations, Multiple Failures and Accuracy”, ACM Transactions on Parallel Computing, 1:2 (2015), 1–28 | DOI
[9] C. Engelmann, G. R. Vallee, T. Naughton, S. L. Scott, “Proactive Fault Tolerance Using Preemptive Migration”, 17th Euromicro International Conference on Parallel, Distributed and Network-based Processing (Weimar, Germany, February 18 – 20, 2009), IEEE, 2009, 252–257 | DOI
[10] A. A. Bondarenko, M. V. Yakobovskiy, “Fault Tolerance for HPC by Using Local Checkpoints”, Bulletin of South Ural State University. Series: Computational Mathematics and Software Engineering, 3:3 (2014), 20–36 | DOI
[11] S. Di, M. S. Bouguerra, L. Bautista-Gomez, F. Cappello, “Optimization of Multi-level Checkpoint Model for Large Scale HPC Applications”, 28th International Parallel and Distributed Processing Symposium (Phoenix, Arizona, USA, May 19 – 23, 2014), IEEE, 2014, 1181–1190 | DOI
[12] A. Benoit, A. Cavelan, V. Le Fèvre, Y. Robert, H. Sun, “Towards Optimal Multi-level Checkpointing”, IEEE Transactions on Computers, 66:7 (2016), 1212–1226 | DOI
[13] S. Di, Y. Robert, F. Vivien, F. Cappello, “Toward an Optimal Online Checkpoint Solution under a Two-level HPC Checkpoint Model”, IEEE Transactions on Parallel and Distributed Systems, 28:1 (2016), 244–259 | DOI
[14] Fault Tolerance Research Hub, http://fault-tolerance.org/
[15] A. A. Bondarenko, P. A. Lyakhov, M. V. Yakobovskiy, “The Overheads Associated with Multi-level Coordinated Checkpointing”, Parallel Computational Technologies (PCT'2017): Proceedings of the International Scientific Conference (Kazan, Russia, 3 – 7 April, 2017), Publishing of the South Ural State University, Chelyabinsk, 2017, 262–270