% Journal article record for Akimova and Gareev (2020), Vestnik YuUrGU, vol. 9, no. 1.
% Notes on encoding of the transliterated journal name:
%   - every accent command is wrapped in its own brace group so classic BibTeX
%     treats the accented letter as a single character;
%   - the transliterated soft sign is written as a math prime instead of the raw
%     Unicode modifier prime, which 8-bit LaTeX setups cannot typeset.
@article{VYURV_2020_9_1_a4,
  author   = {Akimova, E. N. and Gareev, R. A.},
  title    = {Application of analytical modeling of matrix-vector multiplication on multicore processors},
  journal  = {Vestnik {\^U}{\v{z}}no-Ural{$'$}skogo gosudarstvennogo universiteta. Seri{\^a} Vy{\v{c}}islitel{$'$}na{\^a} matematika i informatika},
  year     = {2020},
  volume   = {9},
  number   = {1},
  pages    = {69--82},
  language = {ru},
  url      = {http://geodesic.mathdoc.fr/item/VYURV_2020_9_1_a4/},
}
TY  - JOUR
AU  - E. N. Akimova
AU  - R. A. Gareev
TI  - Application of analytical modeling of matrix-vector multiplication on multicore processors
JO  - Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika
PY  - 2020
SP  - 69
EP  - 82
VL  - 9
IS  - 1
UR  - http://geodesic.mathdoc.fr/item/VYURV_2020_9_1_a4/
LA  - ru
ID  - VYURV_2020_9_1_a4
ER  - 
%0 Journal Article
%A E. N. Akimova
%A R. A. Gareev
%T Application of analytical modeling of matrix-vector multiplication on multicore processors
%J Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika
%D 2020
%P 69-82
%V 9
%N 1
%U http://geodesic.mathdoc.fr/item/VYURV_2020_9_1_a4/
%G ru
%F VYURV_2020_9_1_a4
E. N. Akimova; R. A. Gareev. Application of analytical modeling of matrix-vector multiplication on multicore processors. Vestnik Ûžno-Uralʹskogo gosudarstvennogo universiteta. Seriâ Vyčislitelʹnaâ matematika i informatika, Tome 9 (2020) no. 1, pp. 69-82. http://geodesic.mathdoc.fr/item/VYURV_2020_9_1_a4/
[1] J. Yu, A. Lukefahr, D. Palframan, et al., “Scalpel: Customizing DNN Pruning to the Underlying Hardware Parallelism”, SIGARCH Computer Architecture News, 45:2 (2017), 548–560 | DOI
[2] X. Yang, S. Parthasarathy, P. Sadayappan, “Fast Sparse Matrix-vector Multiplication on GPUs: Implications for Graph Mining”, Proceedings of the VLDB Endowment, 4:4 (2011), 231–242 | DOI
[3] D. Kaushik, W. Gropp, M. Minkoff, et al., “Improving the Performance of Tensor Matrix Vector Multiplication in Cumulative Reaction Probability Based Quantum Chemistry Codes”, High Performance Computing, HiPC 2008, Springer, Berlin–Heidelberg, 2008, 120–130 | DOI
[4] P. S. Martyshko, E. N. Akimova, V. E. Misilov, “Solving the structural inverse gravity problem by the modified gradient methods”, Izvestiya, Physics of the Solid Earth, 52:5 (2016), 704–708 | DOI
[5] S. A. Hassan, M. M. Mahmoud, A. Hemeida, et al., “Effective Implementation of Matrix-Vector Multiplication on Intel’s AVX Multicore Processor”, Computer Languages, Systems & Structures, 51 (2018), 158–175 | DOI
[6] J. Liang, Y. Zhang, “Optimization of GEMV on Intel AVX processor”, International Journal of Database Theory and Application, 9 (2016), 47–60 | DOI
[7] T. M. Low, F. D. Igual, T. M. Smith, E. S. Quintana-Ortí, “Analytical Modeling Is Enough for High-Performance BLIS”, ACM Transactions on Mathematical Software, 43:2 (2016), 1–18 | DOI | MR
[8] G. Frison, Algorithms and Methods for High-Performance Model Predictive Control, Technical University of Denmark, 2016, 345 pp.
[9] E. N. Akimova, R. A. Gareev, “Algorithm of Automatic Parallelization of Generalized Matrix Multiplication”, CEUR Workshop Proceedings, 2017, 1–10
[10] R. Gareev, T. Grosser, M. Kruse, “High-Performance Generalized Tensor Operations: A Compiler-Oriented Approach”, ACM Transactions on Architecture and Code Optimization, 15:3 (2018), 1–34 | DOI
[11] Intel, Intel Math Kernel Library (Intel MKL), https://software.intel.com/en-us/mkl
[12] F. G. Van Zee, R. A. van de Geijn, “BLIS: A Framework for Rapidly Instantiating BLAS Functionality”, ACM Transactions on Mathematical Software, 41:3 (2015), 1–33 | DOI | MR
[13] Z. Xianyi, W. Qian, Z. Yunquan, “Model-driven level 3 BLAS performance optimization on Loongson 3A processor”, Parallel and Distributed Systems, 2012 IEEE 18th International Conference on Parallel and Distributed Systems (ICPADS), IEEE, 2012, 684–691 | DOI
[14] P. Feautrier, C. Lengauer, “Polyhedron Model”, Encyclopedia of Parallel Computing, Springer US, Boston, MA, 2011, 1581–1592 | DOI
[15] T. Grosser, A. Größlinger, C. Lengauer, “Polly—Performing polyhedral optimizations on a low-level intermediate representation”, Parallel Processing Letters, 22:4 (2012) | DOI | MR
[16] C. Lattner, LLVM: An Infrastructure for Multi-Stage Optimization, Master’s Thesis, http://llvm.cs.uiuc.edu | Zbl
[17] E. Apra, M. Klemm, K. Kowalski, “Efficient Implementation of Many-Body Quantum Chemical Methods on the Intel$^\circledR$ Xeon Phi$^\mathrm{TM}$ Coprocessor”, International Conference for High Performance Computing, Networking, Storage and Analysis, 2014, 674–684 | DOI
[18] P. Springer, P. Bientinesi, “Design of a high-performance GEMM-like tensor-tensor multiplication”, ACM Transactions on Mathematical Software, 44:3 (2018), 1–29 | DOI | MR
[19] D. Matthews, “High-performance tensor contraction without BLAS”, SIAM Journal on Scientific Computing, 40 (2016) | DOI | MR
[20] J. Doerfert, K. Streit, S. Hack, et al., Polly’s polyhedral scheduling in the presence of reductions, 2015, arXiv: 1505.07716
[21] B. V. Numerov, “Interpretation of gravitational observations in the case of one contact surface”, Dokl. Akad. Nauk SSSR, 1930, 569–574 | Zbl
[22] V. V. Vasin, I. I. Eremin, Operators and iterative processes of Fejér type: theory and applications, Inverse and Ill-Posed Problems Series, 53, Walter de Gruyter, Berlin, 2009, 155 pp. | DOI | MR