@article{BLIS1,
  author     = {Field G. {Van~Zee} and Robert A. {van~de~Geijn}},
  title      = {{BLIS}: A Framework for Rapidly Instantiating {BLAS} Functionality},
  journal    = {ACM Transactions on Mathematical Software},
  volume     = {41},
  number     = {3},
  pages      = {14:1--14:33},
  month      = {June},
  year       = {2015},
  issue_date = {June 2015},
  url        = {https://doi.acm.org/10.1145/2764454}
}

@inproceedings{10.1145/2503210.2503219,
  author    = {Wang, Qian and Zhang, Xianyi and Zhang, Yunquan and Yi, Qing},
  title     = {AUGEM: Automatically Generate High Performance Dense Linear Algebra Kernels on X86 CPUs},
  year      = {2013},
  isbn      = {9781450323789},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/2503210.2503219},
  doi       = {10.1145/2503210.2503219},
  abstract  = {Basic Linear Algebra Subprograms (BLAS) is a fundamental library in scientific computing. In this paper, we present a template-based optimization framework, AUGEM, which can automatically generate fully optimized assembly code for several dense linear algebra (DLA) kernels, such as GEMM, GEMV, AXPY and DOT, on varying multi-core CPUs without requiring any manual interference from developers. In particular, based on domain-specific knowledge about algorithms of the DLA kernels, we use a collection of parameterized code templates to formulate a number of commonly occurring instruction sequences within the optimized low-level C code of these DLA kernels. Then, our framework uses a specialized low-level C optimizer to identify instruction sequences that match the pre-defined code templates and thereby translates them into extremely efficient SSE/AVX instructions. The DLA kernels generated by our template-based approach surpass the implementations of Intel MKL and AMD ACML BLAS libraries, on both Intel Sandy Bridge and AMD Piledriver processors.},
  booktitle = {Proceedings of the International Conference on High Performance Computing, Networking, Storage and Analysis},
  articleno = {25},
  numpages  = {12},
  keywords  = {DLA code optimization, auto-tuning, code generation},
  location  = {Denver, Colorado},
  series    = {SC '13}
}

@book{lapack99,
  author    = {Anderson, E. and Bai, Z. and Bischof, C. and Blackford, S. and Demmel, J. and Dongarra, J. and Du Croz, J. and Greenbaum, A. and Hammarling, S. and McKenney, A. and Sorensen, D.},
  title     = {{LAPACK} Users' Guide},
  edition   = {Third},
  publisher = {Society for Industrial and Applied Mathematics},
  year      = {1999},
  address   = {Philadelphia, PA},
  isbn      = {0-89871-447-8 (paperback)}
}

@misc{eigenweb,
  author       = {Ga\"{e}l Guennebaud and Beno\^{i}t Jacob and others},
  title        = {Eigen v3},
  howpublished = {http://eigen.tuxfamily.org},
  year         = {2010}
}

@misc{rayon,
  author       = {{Rayon developers}},
  title        = {Rayon},
  howpublished = {https://github.com/rayon-rs/rayon},
  year         = {2015}
}

@article{10.1145/2382585.2382587,
  author     = {{Van~Zee}, Field G. and {van~de~Geijn}, Robert A. and Quintana-Ort\'{\i}, Gregorio and Elizondo, G. Joseph},
  title      = {Families of Algorithms for Reducing a Matrix to Condensed Form},
  year       = {2012},
  issue_date = {November 2012},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {39},
  number     = {1},
  issn       = {0098-3500},
  url        = {https://doi.org/10.1145/2382585.2382587},
  doi        = {10.1145/2382585.2382587},
  abstract   = {In a recent paper it was shown how memory traffic can be diminished by reformulating the classic algorithm for reducing a matrix to bidiagonal form, a preprocess when computing the singular values of a dense matrix. The key is a reordering of the computation so that the most memory-intensive operations can be “fused.” In this article, we show that other operations that reduce matrices to condensed form (reduction to upper Hessenberg form and reduction to tridiagonal form) can be similarly reorganized, yielding different sets of operations that can be fused. By developing the algorithms with a common framework and notation, we facilitate the comparing and contrasting of the different algorithms and opportunities for optimization on sequential architectures. We discuss the algorithms, develop a simple model to estimate the speedup potential from fusing, and showcase performance improvements consistent with what the model predicts.},
  journal    = {ACM Trans. Math. Softw.},
  month      = {nov},
  articleno  = {2},
  numpages   = {32},
  keywords   = {libraries, Hessenberg, tridiagonal, bidiagonal, high performance, reduction, Linear algebra}
}

@article{cholmod,
  author     = {Chen, Yanqing and Davis, Timothy A. and Hager, William W. and Rajamanickam, Sivasankaran},
  title      = {Algorithm 887: {CHOLMOD}, Supernodal Sparse Cholesky Factorization and Update/Downdate},
  year       = {2008},
  issue_date = {October 2008},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {35},
  number     = {3},
  issn       = {0098-3500},
  url        = {https://doi.org/10.1145/1391989.1391995},
  doi        = {10.1145/1391989.1391995},
  abstract   = {CHOLMOD is a set of routines for factorizing sparse symmetric positive definite matrices of the form $A$ or $AA^T$, updating/downdating a sparse Cholesky factorization, solving linear systems, updating/downdating the solution to the triangular system $Lx = b$, and many other sparse matrix functions for both symmetric and unsymmetric matrices. Its supernodal Cholesky factorization relies on LAPACK and the Level-3 BLAS, and obtains a substantial fraction of the peak performance of the BLAS. Both real and complex matrices are supported. CHOLMOD is written in ANSI/ISO C, with both C and MATLAB\texttrademark{} interfaces. It appears in MATLAB 7.2 as $x = A\backslash b$ when $A$ is sparse symmetric positive definite, as well as in several other sparse matrix functions.},
  journal    = {ACM Trans. Math. Softw.},
  month      = {oct},
  articleno  = {22},
  numpages   = {14},
  keywords   = {sparse matrices, linear equations, Cholesky factorization}
}

@book{chandra2001parallel,
  title     = {Parallel Programming in {OpenMP}},
  author    = {Chandra, Rohit and Dagum, Leo and Kohr, David and Menon, Ramesh and Maydan, Dror and McDonald, Jeff},
  year      = {2001},
  publisher = {Morgan Kaufmann}
}

@article{tbb,
  author     = {Pheatt, Chuck},
  title      = {Intel® Threading Building Blocks},
  year       = {2008},
  issue_date = {April 2008},
  publisher  = {Consortium for Computing Sciences in Colleges},
  address    = {Evansville, IN, USA},
  volume     = {23},
  number     = {4},
  issn       = {1937-4771},
  doi        = {10.1016/b978-0-12-803761-4.00011-3},
  abstract   = {Intel® Threading Building Blocks [1] is a C++ runtime library that abstracts the low-level threading details necessary for effectively utilizing multi-core processors. It uses C++ templates to eliminate the need to create and manage threads. Applications tend to be more portable since parallelism is achieved through library calls and utilization of a task manager for scheduling. The task manager analyzes the system the software is running on, chooses the optimal number of threads, and performs load balancing that spreads out the work evenly across all processor cores. The library consists of data structures and algorithms that simplify parallel programming in C++ without requiring a programmer to use native threading packages such as POSIX threads or Windows threads, or even the portable Boost Threads.},
  journal    = {J. Comput. Sci. Coll.},
  month      = {apr},
  pages      = {298},
  numpages   = {1}
}