% Journal article record for the paper cited throughout this file.
% Accented letters in the transliterated journal name are written as brace
% groups ({\^U}, {\v{z}}, ...) so BibTeX treats each one as a single letter.
% NOTE(review): the soft-sign prime characters in the journal name are kept as
% UTF-8; confirm the toolchain in use (Biber or 8-bit BibTeX) accepts them.
@article{VYURV_2014_3_3_a1,
  author   = {Bondarenko, A. A. and Iakobovski, M. V.},
  title    = {Fault tolerance for {HPC} by using local checkpoints},
  journal  = {Vestnik {\^U}{\v{z}}no-Uralʹskogo gosudarstvennogo universiteta. Seri{\^a} Vy{\v{c}}islitelʹna{\^a} matematika i informatika},
  year     = {2014},
  volume   = {3},
  number   = {3},
  pages    = {20--36},
  language = {ru},
  url      = {http://geodesic.mathdoc.fr/item/VYURV_2014_3_3_a1/}
}
TY - JOUR AU - A. A. Bondarenko AU - M. V. Iakobovski TI - Fault tolerance for HPC by using local checkpoints JO - Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika PY - 2014 SP - 20 EP - 36 VL - 3 IS - 3 UR - http://geodesic.mathdoc.fr/item/VYURV_2014_3_3_a1/ LA - ru ID - VYURV_2014_3_3_a1 ER -
%0 Journal Article %A A. A. Bondarenko %A M. V. Iakobovski %T Fault tolerance for HPC by using local checkpoints %J Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika %D 2014 %P 20-36 %V 3 %N 3 %U http://geodesic.mathdoc.fr/item/VYURV_2014_3_3_a1/ %G ru %F VYURV_2014_3_3_a1
A. A. Bondarenko; M. V. Iakobovski. Fault tolerance for HPC by using local checkpoints. Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika, Tome 3 (2014) no. 3, pp. 20-36. http://geodesic.mathdoc.fr/item/VYURV_2014_3_3_a1/
[1] W. Bland, A. Bouteiller, T. Hérault, G. Bosilca, J. Dongarra, “Post-Failure Recovery of MPI Communication Capability: Design and Rationale”, International Journal of High Performance Computing Applications, 27:3 (2013), 244–254 | DOI
[2] F. Cappello, “Fault Tolerance in Petascale/Exascale Systems: Current Knowledge, Challenges and Research Opportunities”, International Journal of High Performance Computing Applications, 23:3 (2009), 212–226 | DOI
[3] C.-H. Hsu, W.-C. Feng, “A Power-aware Run-time System for High-performance Computing”, Proceedings of SC'05: The ACM/IEEE International Conference on High-Performance Computing, Networking, and Storage (Seattle, Washington USA November 12 - 18, 2005), IEEE Press, 2005, 1–9 | DOI
[4] D. Sorin, Fault Tolerant Computer Architecture, Synthesis Lectures on Computer Architecture, Morgan & Claypool, 2009, 104 pp. | DOI
[5] E.N. Elnozahy, L. Alvisi, Y. Wang, D.B. Johnson, “A Survey of Rollback-Recovery Protocols in Message-Passing Systems”, ACM Computing Surveys, 34:3 (2002), 375–408 | DOI
[6] I. Koren, C.M. Krishna, Fault-Tolerant Systems, Morgan Kaufmann Publishers, San Francisco, CA, 2007, 378 pp.
[7] A.S. Tanenbaum, M. van Steen, Distributed Systems: Principles and Paradigms, Prentice Hall PTR, New Jersey, 2002, 803 pp.
[8] P.M. Kogge, et al., ExaScale Computing Study: Technology Challenges in Achieving Exascale Systems (Univ. of Notre Dame, CSE Dept.), 2008 (accessed: 25.07.2014) http://www.cse.nd.edu/Reports/2008/TR-2008-13.pdf
[9] A. Avizienis, J.C. Laprie, B. Randell, C. Landwehr, “Basic Concepts and Taxonomy of Dependable and Secure Computing”, IEEE Transactions on Dependable and Secure Computing, 1:1 (2004), 11–33 | DOI
[10] P. Jalote, Fault Tolerance in Distributed Systems, Prentice Hall, New Jersey, 1994, 448 pp.
[11] G. Tel, Introduction to Distributed Algorithms, Cambridge University Press, 2000, 596 pp.
[12] The Computer Failure Data Repository, (accessed: 25.07.2014) https://www.usenix.org/cfdr
[13] Addressing the Challenges of Petascale Computing for Scientific Discovery on Information Storage Capacity, Performance, Concurrency, Reliability, Availability, and Manageability, (accessed: 25.07.2014) http://pdsi.nersc.gov/
[14] Y. Yuan, Y. Wu, Q. Wang, G. Yang, W. Zheng, “Job Failures in High Performance Computing Systems: A Large-scale Empirical Study”, Computers & Mathematics with Applications, 63:2 (2012), 365–377 | DOI
[15] X. Dong, N. Muralimanohar, N.P. Jouppi, Y. Xie, “A Case Study of Incremental and Background Hybrid In-Memory Checkpointing”, Proceedings of the 2010 Exascale Evaluation and Research Techniques Workshop (Pittsburgh, PA, USA March 14, 2010), ACM, 2010, 119–147
[16] B. Schroeder, G.A. Gibson, “Disk Failures in the Real World: What Does an MTTF of 1,000,000 Hours Mean to You?”, Proceedings of the 5th USENIX Conference on File and Storage Technologies (San Jose, CA, USA February 13-16, 2007), USENIX, 2007, 1–16
[17] K.B. Ferreira, R. Riesen, P.G. Bridges, D. Arnold, R. Brightwell, “Accelerating Incremental Checkpointing for Extreme-scale Computing”, Future Generation Computer Systems, 30:1 (2014), 66–77 | DOI
[18] A.Yu. Polyakov, A.A. Danekina, “Optimization of Time Creation and Checkpoint's Volume for Parallel Programs”, Bulletin of the Siberian State University of Telecommunications and Information Sciences, 2 (2010), 87–100
[19] N.H. Vaidya, “A Case for Two-Level Distributed Recovery Schemes”, Proceedings of the ACM SIGMETRICS Joint International Conference on Measurement and Modeling of Computer Systems (Ottawa, Canada, May 15-19, 1995), ACM, 1995, 64–73
[20] J.S. Plank, K. Li, M.A. Puening, “Diskless Checkpointing”, IEEE Transactions on Parallel and Distributed Systems, 9:10 (1998), 972–986 | DOI
[21] X-COM Parallel.ru, (accessed: 25.07.2014) http://x-com.parallel.ru/node/10
[22] A.V. Baranov, A.V. Kiselev, E.A. Kiselev, V.V. Korneev, D.V. Semenov, Software Package «Pyramid» for Organization of Parallel Computing with Parallelization of Data, (accessed: 25.07.2014) http://agora.guru.ru/abrau2010/pdf/299.pdf
[23] OpenTS - Technology and Software Support for Parallelization of Data-Parallel Applications, (accessed: 25.07.2014) http://skif.pereslavl.ru/psi-info/rcms-open.ts/index.ru.html
[24] HTCondor High Throughput Computing, (accessed: 25.07.2014) http://research.cs.wisc.edu/htcondor/index.html
[25] Berkeley Lab Checkpoint/Restart (BLCR) for LINUX, (accessed: 25.07.2014) http://crd.lbl.gov/groupsdepts/ftg/projects/current-projects/BLCR/
[26] Open MPI: Open Source High Performance Computing, (accessed: 25.07.2014) http://www.open-mpi.org
[27] MPICH, (accessed: 25.07.2014) http://www.mpich.org
[28] MVAPICH: MPI over InfiniBand, 10GigE/iWARP and RoCE, (accessed: 25.07.2014) http://mvapich.cse.ohio-state.edu
[29] I.P. Egwutuoha, D. Levy, B. Selic, S. Chen, “A Survey of Fault Tolerance Mechanisms and Checkpoint/Restart Implementations for High Performance Computing Systems”, The Journal of Supercomputing, 65:3 (2013), 1302–1326 | DOI
[30] Message Passing Interface Forum, (accessed: 25.07.2014) http://www.mpi-forum.org/
[31] ICL Fault Tolerance, (accessed: 25.07.2014) http://fault-tolerance.org/ulfm/ulfm-specification
[32] X. Dong, N. Muralimanohar, N. Jouppi, R. Kaufmann, Y. Xie, “Leveraging 3D PCRAM Technologies to Reduce Checkpoint Overhead for Future Exascale Systems”, Proceedings of the Conference on High Performance Computing Networking, Storage and Analysis (Portland, Oregon USA November 14-20, 2009), ACM, 2009, 57–68 | DOI
[33] FT-MPI, (accessed: 25.07.2014) http://icl.cs.utk.edu/ftmpi/people/index.html