// Copyright (c) 2010-2023, Lawrence Livermore National Security, LLC. Produced // at the Lawrence Livermore National Laboratory. All Rights reserved. See files // LICENSE and NOTICE for details. LLNL-CODE-806117. // // This file is part of the MFEM library. For more information and source code // availability visit https://mfem.org. // // MFEM is free software; you can redistribute it and/or modify it under the // terms of the BSD-3 license. We welcome feedback and contributions, see file // CONTRIBUTING.md for details. #ifndef MFEM_LINALG_KERNELS_HPP #define MFEM_LINALG_KERNELS_HPP #include "../config/config.hpp" #include "../general/backends.hpp" #include "../general/globals.hpp" #include "matrix.hpp" #include "tmatrix.hpp" #include "tlayout.hpp" #include "ttensor.hpp" // This header contains stand-alone functions for "small" dense linear algebra // (at quadrature point or element-level) designed to be inlined directly into // device kernels. // Many methods of the DenseMatrix class and some of the Vector class call these // kernels directly on the host, see the implementations in linalg/densemat.cpp // and linalg/vector.cpp. namespace mfem { namespace kernels { /// Compute the square of the Euclidean distance to another vector template MFEM_HOST_DEVICE inline double DistanceSquared(const double *x, const double *y) { double d = 0.0; for (int i = 0; i < dim; i++) { d += (x[i]-y[i])*(x[i]-y[i]); } return d; } /// Creates n x n diagonal matrix with diagonal elements c template MFEM_HOST_DEVICE inline void Diag(const double c, double *data) { const int N = dim*dim; for (int i = 0; i < N; i++) { data[i] = 0.0; } for (int i = 0; i < dim; i++) { data[i*(dim+1)] = c; } } /// Vector subtraction operation: z = a * (x - y) template MFEM_HOST_DEVICE inline void Subtract(const double a, const double *x, const double *y, double *z) { for (int i = 0; i < dim; i++) { z[i] = a * (x[i] - y[i]); } } /// Dense matrix operation: VWt += v w^t template MFEM_HOST_DEVICE inline void AddMultVWt(const double *v, const double *w, double *VWt) { for (int i = 0; i < dim; i++) { const double vi = v[i]; for (int j = 0; j < dim; j++) { VWt[i*dim+j] += vi * w[j]; } } } template MFEM_HOST_DEVICE inline void FNorm(double &scale_factor, double &scaled_fnorm2, const T *data) { int i, hw = H * W; T max_norm = 0.0, entry, fnorm2; for (i = 0; i < hw; i++) { entry = fabs(data[i]); if (entry > max_norm) { max_norm = entry; } } if (max_norm == 0.0) { scale_factor = scaled_fnorm2 = 0.0; return; } fnorm2 = 0.0; for (i = 0; i < hw; i++) { entry = data[i] / max_norm; fnorm2 += entry * entry; } scale_factor = max_norm; scaled_fnorm2 = fnorm2; } /// Compute the Frobenius norm of the matrix template MFEM_HOST_DEVICE inline double FNorm(const T *data) { double s, n2; kernels::FNorm(s, n2, data); return s*sqrt(n2); } /// Compute the square of the Frobenius norm of the matrix template MFEM_HOST_DEVICE inline double FNorm2(const T *data) { double s, n2; kernels::FNorm(s, n2, data); return s*s*n2; } /// Returns the l2 norm of the Vector with given @a size and @a data. template MFEM_HOST_DEVICE inline double Norml2(const int size, const T *data) { if (0 == size) { return 0.0; } if (1 == size) { return std::abs(data[0]); } T scale = 0.0; T sum = 0.0; for (int i = 0; i < size; i++) { if (data[i] != 0.0) { const T absdata = fabs(data[i]); if (scale <= absdata) { const T sqr_arg = scale / absdata; sum = 1.0 + sum * (sqr_arg * sqr_arg); scale = absdata; continue; } // end if scale <= absdata const T sqr_arg = absdata / scale; sum += (sqr_arg * sqr_arg); // else scale > absdata } // end if data[i] != 0 } return scale * sqrt(sum); } /** @brief Matrix vector multiplication: y = A x, where the matrix A is of size @a height x @a width with given @a data, while @a x and @a y specify the data of the input and output vectors. */ template MFEM_HOST_DEVICE inline void Mult(const int height, const int width, const TA *data, const TX *x, TY *y) { if (width == 0) { for (int row = 0; row < height; row++) { y[row] = 0.0; } return; } const TA *d_col = data; TX x_col = x[0]; for (int row = 0; row < height; row++) { y[row] = x_col*d_col[row]; } d_col += height; for (int col = 1; col < width; col++) { x_col = x[col]; for (int row = 0; row < height; row++) { y[row] += x_col*d_col[row]; } d_col += height; } } /** @brief Matrix transpose vector multiplication: y = At x, where the matrix A is of size @a height x @a width with given @a data, while @a x and @a y specify the data of the input and output vectors. */ template MFEM_HOST_DEVICE inline void MultTranspose(const int height, const int width, const TA *data, const TX *x, TY *y) { if (height == 0) { for (int row = 0; row < width; row++) { y[row] = 0.0; } return; } TY *y_off = y; for (int i = 0; i < width; ++i) { TY val = 0.0; for (int j = 0; j < height; ++j) { val += x[j] * data[i * height + j]; } *y_off = val; y_off++; } } /// Symmetrize a square matrix with given @a size and @a data: A -> (A+A^T)/2. template MFEM_HOST_DEVICE inline void Symmetrize(const int size, T *data) { for (int i = 0; i < size; i++) { for (int j = 0; j < i; j++) { const T a = 0.5 * (data[i*size+j] + data[j*size+i]); data[j*size+i] = data[i*size+j] = a; } } } /// Compute the determinant of a square matrix of size dim with given @a data. template MFEM_HOST_DEVICE inline T Det(const T *data) { return TDetHD(ColumnMajorLayout2D(), data); } /** @brief Return the inverse of a matrix with given @a size and @a data into the matrix with data @a inv_data. */ template MFEM_HOST_DEVICE inline void CalcInverse(const T *data, T *inv_data) { typedef ColumnMajorLayout2D layout_t; const T det = TAdjDetHD(layout_t(), data, layout_t(), inv_data); TAssignHD(layout_t(), inv_data, static_cast(1.0)/det); } /** @brief Return the adjugate of a matrix */ template MFEM_HOST_DEVICE inline void CalcAdjugate(const T *data, T *adj_data) { typedef ColumnMajorLayout2D layout_t; TAdjugateHD(layout_t(), data, layout_t(), adj_data); } /** @brief Compute C = A + alpha*B, where the matrices A, B and C are of size @a height x @a width with data @a Adata, @a Bdata and @a Cdata. */ template MFEM_HOST_DEVICE inline void Add(const int height, const int width, const TALPHA alpha, const TA *Adata, const TB *Bdata, TC *Cdata) { for (int j = 0; j < width; j++) { for (int i = 0; i < height; i++) { const int n = i*width+j; Cdata[n] = Adata[n] + alpha * Bdata[n]; } } } /** @brief Compute C = alpha*A + beta*B, where the matrices A, B and C are of size @a height x @a width with data @a Adata, @a Bdata and @a Cdata. */ template MFEM_HOST_DEVICE inline void Add(const int height, const int width, const TALPHA alpha, const TA *Adata, const TBETA beta, const TB *Bdata, TC *Cdata) { const int m = height * width; for (int i = 0; i < m; i++) { Cdata[i] = alpha * Adata[i] + beta * Bdata[i]; } } /** @brief Compute B += A, where the matrices A and B are of size @a height x @a width with data @a Adata and @a Bdata. */ template MFEM_HOST_DEVICE inline void Add(const int height, const int width, const TA *Adata, TB *Bdata) { const int m = height * width; for (int i = 0; i < m; i++) { Bdata[i] += Adata[i]; } } /** @brief Compute B +=alpha*A, where the matrices A and B are of size @a height x @a width with data @a Adata and @a Bdata. */ template MFEM_HOST_DEVICE inline void Add(const int height, const int width, const double alpha, const TA *Adata, TB *Bdata) { const int m = height * width; for (int i = 0; i < m; i++) { Bdata[i] += alpha * Adata[i]; } } /** @brief Compute B = alpha*A, where the matrices A and B are of size @a height x @a width with data @a Adata and @a Bdata. */ template MFEM_HOST_DEVICE inline void Set(const int height, const int width, const double alpha, const TA *Adata, TB *Bdata) { const int m = height * width; for (int i = 0; i < m; i++) { Bdata[i] = alpha * Adata[i]; } } /** @brief Matrix-matrix multiplication: A = B * C, where the matrices A, B and C are of sizes @a Aheight x @a Awidth, @a Aheight x @a Bwidth and @a Bwidth x @a Awidth, respectively. */ template MFEM_HOST_DEVICE inline void Mult(const int Aheight, const int Awidth, const int Bwidth, const TB *Bdata, const TC *Cdata, TA *Adata) { const int ah_x_aw = Aheight * Awidth; for (int i = 0; i < ah_x_aw; i++) { Adata[i] = 0.0; } for (int j = 0; j < Awidth; j++) { for (int k = 0; k < Bwidth; k++) { for (int i = 0; i < Aheight; i++) { Adata[i+j*Aheight] += Bdata[i+k*Aheight] * Cdata[k+j*Bwidth]; } } } } /** @brief Multiply a matrix of size @a Aheight x @a Awidth and data @a Adata with the transpose of a matrix of size @a Bheight x @a Awidth and data @a Bdata: A * Bt. Return the result in a matrix with data @a ABtdata. */ template MFEM_HOST_DEVICE inline void MultABt(const int Aheight, const int Awidth, const int Bheight, const TA *Adata, const TB *Bdata, TC *ABtdata) { const int ah_x_bh = Aheight * Bheight; for (int i = 0; i < ah_x_bh; i++) { ABtdata[i] = 0.0; } for (int k = 0; k < Awidth; k++) { TC *c = ABtdata; for (int j = 0; j < Bheight; j++) { const double bjk = Bdata[j]; for (int i = 0; i < Aheight; i++) { c[i] += Adata[i] * bjk; } c += Aheight; } Adata += Aheight; Bdata += Bheight; } } /** @brief Multiply the transpose of a matrix of size @a Aheight x @a Awidth and data @a Adata with a matrix of size @a Aheight x @a Bwidth and data @a Bdata: At * B. Return the result in a matrix with data @a AtBdata. */ template MFEM_HOST_DEVICE inline void MultAtB(const int Aheight, const int Awidth, const int Bwidth, const TA *Adata, const TB *Bdata, TC *AtBdata) { TC *c = AtBdata; for (int i = 0; i < Bwidth; ++i) { for (int j = 0; j < Awidth; ++j) { TC val = 0.0; for (int k = 0; k < Aheight; ++k) { val += Adata[j * Aheight + k] * Bdata[i * Aheight + k]; } *c = val; c++; } } } /// Given a matrix of size 2x1, 3x1, or 3x2, compute the left inverse. template MFEM_HOST_DEVICE void CalcLeftInverse(const double *data, double *left_inv); /// Compute the spectrum of the matrix of size dim with given @a data, returning /// the eigenvalues in the array @a lambda and the eigenvectors in the array @a /// vec (listed consecutively). template MFEM_HOST_DEVICE void CalcEigenvalues(const double *data, double *lambda, double *vec); /// Return the i'th singular value of the matrix of size dim with given @a data. template MFEM_HOST_DEVICE double CalcSingularvalue(const double *data, const int i); // Utility functions for CalcEigenvalues and CalcSingularvalue namespace internal { /// Utility function to swap the values of @a a and @a b. template MFEM_HOST_DEVICE static inline void Swap(T &a, T &b) { T tmp = a; a = b; b = tmp; } const double Epsilon = std::numeric_limits::epsilon(); /// Utility function used in CalcSingularvalue<3>. MFEM_HOST_DEVICE static inline void Eigenvalues2S(const double &d12, double &d1, double &d2) { const double sqrt_1_eps = sqrt(1./Epsilon); if (d12 != 0.) { // "The Symmetric Eigenvalue Problem", B. N. Parlett, pp.189-190 double t; const double zeta = (d2 - d1)/(2*d12); // inf/inf from overflows? if (fabs(zeta) < sqrt_1_eps) { t = d12*copysign(1./(fabs(zeta) + sqrt(1. + zeta*zeta)), zeta); } else { t = d12*copysign(0.5/fabs(zeta), zeta); } d1 -= t; d2 += t; } } /// Utility function used in CalcEigenvalues(). MFEM_HOST_DEVICE static inline void Eigensystem2S(const double &d12, double &d1, double &d2, double &c, double &s) { const double sqrt_1_eps = sqrt(1./Epsilon); if (d12 == 0.0) { c = 1.; s = 0.; } else { // "The Symmetric Eigenvalue Problem", B. N. Parlett, pp.189-190 double t; const double zeta = (d2 - d1)/(2*d12); const double azeta = fabs(zeta); if (azeta < sqrt_1_eps) { t = copysign(1./(azeta + sqrt(1. + zeta*zeta)), zeta); } else { t = copysign(0.5/azeta, zeta); } c = sqrt(1./(1. + t*t)); s = c*t; t *= d12; d1 -= t; d2 += t; } } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline void GetScalingFactor(const double &d_max, double &mult) { int d_exp; if (d_max > 0.) { mult = frexp(d_max, &d_exp); if (d_exp == std::numeric_limits::max_exponent) { mult *= std::numeric_limits::radix; } mult = d_max/mult; } else { mult = 1.; } // mult = 2^d_exp is such that d_max/mult is in [0.5,1) or in other words // d_max is in the interval [0.5,1)*mult } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline bool KernelVector2G(const int &mode, double &d1, double &d12, double &d21, double &d2) { // Find a vector (z1,z2) in the "near"-kernel of the matrix // | d1 d12 | // | d21 d2 | // using QR factorization. // The vector (z1,z2) is returned in (d1,d2). Return 'true' if the matrix // is zero without setting (d1,d2). // Note: in the current implementation |z1| + |z2| = 1. // l1-norms of the columns double n1 = fabs(d1) + fabs(d21); double n2 = fabs(d2) + fabs(d12); bool swap_columns = (n2 > n1); double mu; if (!swap_columns) { if (n1 == 0.) { return true; } if (mode == 0) // eliminate the larger entry in the column { if (fabs(d1) > fabs(d21)) { Swap(d1, d21); Swap(d12, d2); } } else // eliminate the smaller entry in the column { if (fabs(d1) < fabs(d21)) { Swap(d1, d21); Swap(d12, d2); } } } else { // n2 > n1, swap columns 1 and 2 if (mode == 0) // eliminate the larger entry in the column { if (fabs(d12) > fabs(d2)) { Swap(d1, d2); Swap(d12, d21); } else { Swap(d1, d12); Swap(d21, d2); } } else // eliminate the smaller entry in the column { if (fabs(d12) < fabs(d2)) { Swap(d1, d2); Swap(d12, d21); } else { Swap(d1, d12); Swap(d21, d2); } } } n1 = hypot(d1, d21); if (d21 != 0.) { // v = (n1, n2)^t, |v| = 1 // Q = I - 2 v v^t, Q (d1, d21)^t = (mu, 0)^t mu = copysign(n1, d1); n1 = -d21*(d21/(d1 + mu)); // = d1 - mu d1 = mu; // normalize (n1,d21) to avoid overflow/underflow // normalize (n1,d21) by the max-norm to avoid the sqrt call if (fabs(n1) <= fabs(d21)) { // (n1,n2) <-- (n1/d21,1) n1 = n1/d21; mu = (2./(1. + n1*n1))*(n1*d12 + d2); d2 = d2 - mu; d12 = d12 - mu*n1; } else { // (n1,n2) <-- (1,d21/n1) n2 = d21/n1; mu = (2./(1. + n2*n2))*(d12 + n2*d2); d2 = d2 - mu*n2; d12 = d12 - mu; } } // Solve: // | d1 d12 | | z1 | = | 0 | // | 0 d2 | | z2 | | 0 | // choose (z1,z2) to minimize |d1*z1 + d12*z2| + |d2*z2| // under the condition |z1| + |z2| = 1, z2 >= 0 (for uniqueness) // set t = z1, z2 = 1 - |t|, -1 <= t <= 1 // objective function is: // |d1*t + d12*(1 - |t|)| + |d2|*(1 - |t|) -- piecewise linear with // possible minima are -1,0,1,t1 where t1: d1*t1 + d12*(1 - |t1|) = 0 // values: @t=+/-1 -> |d1|, @t=0 -> |n1| + |d2|, @t=t1 -> |d2|*(1 - |t1|) // evaluate z2 @t=t1 mu = -d12/d1; // note: |mu| <= 1, if using l2-norm for column pivoting // |mu| <= sqrt(2), if using l1-norm n2 = 1./(1. + fabs(mu)); // check if |d1|<=|d2|*z2 if (fabs(d1) <= n2*fabs(d2)) { d2 = 0.; d1 = 1.; } else { d2 = n2; // d1 = (n2 < 0.5) ? copysign(1. - n2, mu) : mu*n2; d1 = mu*n2; } if (swap_columns) { Swap(d1, d2); } return false; } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline void Vec_normalize3_aux(const double &x1, const double &x2, const double &x3, double &n1, double &n2, double &n3) { double t, r; const double m = fabs(x1); r = x2/m; t = 1. + r*r; r = x3/m; t = sqrt(1./(t + r*r)); n1 = copysign(t, x1); t /= m; n2 = x2*t; n3 = x3*t; } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline void Vec_normalize3(const double &x1, const double &x2, const double &x3, double &n1, double &n2, double &n3) { // should work ok when xk is the same as nk for some or all k if (fabs(x1) >= fabs(x2)) { if (fabs(x1) >= fabs(x3)) { if (x1 != 0.) { Vec_normalize3_aux(x1, x2, x3, n1, n2, n3); } else { n1 = n2 = n3 = 0.; } return; } } else if (fabs(x2) >= fabs(x3)) { Vec_normalize3_aux(x2, x1, x3, n2, n1, n3); return; } Vec_normalize3_aux(x3, x1, x2, n3, n1, n2); } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline int KernelVector3G_aux(const int &mode, double &d1, double &d2, double &d3, double &c12, double &c13, double &c23, double &c21, double &c31, double &c32) { int kdim; double mu, n1, n2, n3, s1, s2, s3; s1 = hypot(c21, c31); n1 = hypot(d1, s1); if (s1 != 0.) { // v = (s1, s2, s3)^t, |v| = 1 // Q = I - 2 v v^t, Q (d1, c12, c13)^t = (mu, 0, 0)^t mu = copysign(n1, d1); n1 = -s1*(s1/(d1 + mu)); // = d1 - mu d1 = mu; // normalize (n1,c21,c31) to avoid overflow/underflow // normalize (n1,c21,c31) by the max-norm to avoid the sqrt call if (fabs(n1) >= fabs(c21)) { if (fabs(n1) >= fabs(c31)) { // n1 is max, (s1,s2,s3) <-- (1,c21/n1,c31/n1) s2 = c21/n1; s3 = c31/n1; mu = 2./(1. + s2*s2 + s3*s3); n2 = mu*(c12 + s2*d2 + s3*c32); n3 = mu*(c13 + s2*c23 + s3*d3); c12 = c12 - n2; d2 = d2 - s2*n2; c32 = c32 - s3*n2; c13 = c13 - n3; c23 = c23 - s2*n3; d3 = d3 - s3*n3; goto done_column_1; } } else if (fabs(c21) >= fabs(c31)) { // c21 is max, (s1,s2,s3) <-- (n1/c21,1,c31/c21) s1 = n1/c21; s3 = c31/c21; mu = 2./(1. + s1*s1 + s3*s3); n2 = mu*(s1*c12 + d2 + s3*c32); n3 = mu*(s1*c13 + c23 + s3*d3); c12 = c12 - s1*n2; d2 = d2 - n2; c32 = c32 - s3*n2; c13 = c13 - s1*n3; c23 = c23 - n3; d3 = d3 - s3*n3; goto done_column_1; } // c31 is max, (s1,s2,s3) <-- (n1/c31,c21/c31,1) s1 = n1/c31; s2 = c21/c31; mu = 2./(1. + s1*s1 + s2*s2); n2 = mu*(s1*c12 + s2*d2 + c32); n3 = mu*(s1*c13 + s2*c23 + d3); c12 = c12 - s1*n2; d2 = d2 - s2*n2; c32 = c32 - n2; c13 = c13 - s1*n3; c23 = c23 - s2*n3; d3 = d3 - n3; } done_column_1: // Solve: // | d2 c23 | | z2 | = | 0 | // | c32 d3 | | z3 | | 0 | if (KernelVector2G(mode, d2, c23, c32, d3)) { // Have two solutions: // two vectors in the kernel are P (-c12/d1, 1, 0)^t and // P (-c13/d1, 0, 1)^t where P is the permutation matrix swapping // entries 1 and col. // A vector orthogonal to both these vectors is P (1, c12/d1, c13/d1)^t d2 = c12/d1; d3 = c13/d1; d1 = 1.; kdim = 2; } else { // solve for z1: // note: |z1| <= a since |z2| + |z3| = 1, and // max{|c12|,|c13|} <= max{norm(col. 2),norm(col. 3)} // <= norm(col. 1) <= a |d1| // a = 1, if using l2-norm for column pivoting // a = sqrt(3), if using l1-norm d1 = -(c12*d2 + c13*d3)/d1; kdim = 1; } Vec_normalize3(d1, d2, d3, d1, d2, d3); return kdim; } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline int KernelVector3S(const int &mode, const double &d12, const double &d13, const double &d23, double &d1, double &d2, double &d3) { // Find a unit vector (z1,z2,z3) in the "near"-kernel of the matrix // | d1 d12 d13 | // | d12 d2 d23 | // | d13 d23 d3 | // using QR factorization. // The vector (z1,z2,z3) is returned in (d1,d2,d3). // Returns the dimension of the kernel, kdim, but never zero. // - if kdim == 3, then (d1,d2,d3) is not defined, // - if kdim == 2, then (d1,d2,d3) is a vector orthogonal to the kernel, // - otherwise kdim == 1 and (d1,d2,d3) is a vector in the "near"-kernel. double c12 = d12, c13 = d13, c23 = d23; double c21, c31, c32; int col, row; // l1-norms of the columns: c32 = fabs(d1) + fabs(c12) + fabs(c13); c31 = fabs(d2) + fabs(c12) + fabs(c23); c21 = fabs(d3) + fabs(c13) + fabs(c23); // column pivoting: choose the column with the largest norm if (c32 >= c21) { col = (c32 >= c31) ? 1 : 2; } else { col = (c31 >= c21) ? 2 : 3; } switch (col) { case 1: if (c32 == 0.) // zero matrix { return 3; } break; case 2: if (c31 == 0.) // zero matrix { return 3; } Swap(c13, c23); Swap(d1, d2); break; case 3: if (c21 == 0.) // zero matrix { return 3; } Swap(c12, c23); Swap(d1, d3); } // row pivoting depending on 'mode' if (mode == 0) { if (fabs(d1) <= fabs(c13)) { row = (fabs(d1) <= fabs(c12)) ? 1 : 2; } else { row = (fabs(c12) <= fabs(c13)) ? 2 : 3; } } else { if (fabs(d1) >= fabs(c13)) { row = (fabs(d1) >= fabs(c12)) ? 1 : 2; } else { row = (fabs(c12) >= fabs(c13)) ? 2 : 3; } } switch (row) { case 1: c21 = c12; c31 = c13; c32 = c23; break; case 2: c21 = d1; c31 = c13; c32 = c23; d1 = c12; c12 = d2; d2 = d1; c13 = c23; c23 = c31; break; case 3: c21 = c12; c31 = d1; c32 = c12; d1 = c13; c12 = c23; c13 = d3; d3 = d1; } row = KernelVector3G_aux(mode, d1, d2, d3, c12, c13, c23, c21, c31, c32); // row is kdim switch (col) { case 2: Swap(d1, d2); break; case 3: Swap(d1, d3); } return row; } /// Utility function used in CalcEigenvalues<3>. MFEM_HOST_DEVICE static inline int Reduce3S(const int &mode, double &d1, double &d2, double &d3, double &d12, double &d13, double &d23, double &z1, double &z2, double &z3, double &v1, double &v2, double &v3, double &g) { // Given the matrix // | d1 d12 d13 | // A = | d12 d2 d23 | // | d13 d23 d3 | // and a unit eigenvector z=(z1,z2,z3), transform the matrix A into the // matrix B = Q P A P Q that has the form // | b1 0 0 | // B = Q P A P Q = | 0 b2 b23 | // | 0 b23 b3 | // where P is the permutation matrix switching entries 1 and k, and // Q is the reflection matrix Q = I - g v v^t, defined by: set y = P z and // v = c(y - e_1); if y = e_1, then v = 0 and Q = I. // Note: Q y = e_1, Q e_1 = y ==> Q P A P Q e_1 = ... = lambda e_1. // The entries (b1,b2,b3,b23) are returned in (d1,d2,d3,d23), and the // return value of the function is k. The variable g = 2/(v1^2+v2^2+v3^3). int k; double s, w1, w2, w3; if (mode == 0) { // choose k such that z^t e_k = zk has the smallest absolute value, i.e. // the angle between z and e_k is closest to pi/2 if (fabs(z1) <= fabs(z3)) { k = (fabs(z1) <= fabs(z2)) ? 1 : 2; } else { k = (fabs(z2) <= fabs(z3)) ? 2 : 3; } } else { // choose k such that zk is the largest by absolute value if (fabs(z1) >= fabs(z3)) { k = (fabs(z1) >= fabs(z2)) ? 1 : 2; } else { k = (fabs(z2) >= fabs(z3)) ? 2 : 3; } } switch (k) { case 2: Swap(d13, d23); Swap(d1, d2); Swap(z1, z2); break; case 3: Swap(d12, d23); Swap(d1, d3); Swap(z1, z3); } s = hypot(z2, z3); if (s == 0.) { // s can not be zero, if zk is the smallest (mode == 0) v1 = v2 = v3 = 0.; g = 1.; } else { g = copysign(1., z1); v1 = -s*(s/(z1 + g)); // = z1 - g // normalize (v1,z2,z3) by its max-norm, avoiding the sqrt call g = fabs(v1); if (fabs(z2) > g) { g = fabs(z2); } if (fabs(z3) > g) { g = fabs(z3); } v1 = v1/g; v2 = z2/g; v3 = z3/g; g = 2./(v1*v1 + v2*v2 + v3*v3); // Compute Q A Q = A - v w^t - w v^t, where // w = u - (g/2)(v^t u) v, and u = g A v // set w = g A v w1 = g*( d1*v1 + d12*v2 + d13*v3); w2 = g*(d12*v1 + d2*v2 + d23*v3); w3 = g*(d13*v1 + d23*v2 + d3*v3); // w := w - (g/2)(v^t w) v s = (g/2)*(v1*w1 + v2*w2 + v3*w3); w1 -= s*v1; w2 -= s*v2; w3 -= s*v3; // dij -= vi*wj + wi*vj d1 -= 2*v1*w1; d2 -= 2*v2*w2; d23 -= v2*w3 + v3*w2; d3 -= 2*v3*w3; // compute the off-diagonal entries on the first row/column of B which // should be zero (for debugging): #if 0 s = d12 - v1*w2 - v2*w1; // b12 = 0 s = d13 - v1*w3 - v3*w1; // b13 = 0 #endif } switch (k) { case 2: Swap(z1, z2); break; case 3: Swap(z1, z3); } return k; } } // namespace kernels::internal // Implementations of CalcLeftInverse for dim = 1, 2. template<> MFEM_HOST_DEVICE inline void CalcLeftInverse<2,1>(const double *d, double *left_inv) { const double t = 1.0 / (d[0]*d[0] + d[1]*d[1]); left_inv[0] = d[0] * t; left_inv[1] = d[1] * t; } template<> MFEM_HOST_DEVICE inline void CalcLeftInverse<3,1>(const double *d, double *left_inv) { const double t = 1.0 / (d[0]*d[0] + d[1]*d[1] + d[2]*d[2]); left_inv[0] = d[0] * t; left_inv[1] = d[1] * t; left_inv[2] = d[2] * t; } template<> MFEM_HOST_DEVICE inline void CalcLeftInverse<3,2>(const double *d, double *left_inv) { double e = d[0]*d[0] + d[1]*d[1] + d[2]*d[2]; double g = d[3]*d[3] + d[4]*d[4] + d[5]*d[5]; double f = d[0]*d[3] + d[1]*d[4] + d[2]*d[5]; const double t = 1.0 / (e*g - f*f); e *= t; g *= t; f *= t; left_inv[0] = d[0]*g - d[3]*f; left_inv[1] = d[3]*e - d[0]*f; left_inv[2] = d[1]*g - d[4]*f; left_inv[3] = d[4]*e - d[1]*f; left_inv[4] = d[2]*g - d[5]*f; left_inv[5] = d[5]*e - d[2]*f; } // Implementations of CalcEigenvalues and CalcSingularvalue for dim = 2, 3. /// Compute the spectrum of the matrix of size 2 with given @a data, returning /// the eigenvalues in the array @a lambda and the eigenvectors in the array @a /// vec (listed consecutively). template<> MFEM_HOST_DEVICE inline void CalcEigenvalues<2>(const double *data, double *lambda, double *vec) { double d0 = data[0]; double d2 = data[2]; // use the upper triangular entry double d3 = data[3]; double c, s; internal::Eigensystem2S(d2, d0, d3, c, s); if (d0 <= d3) { lambda[0] = d0; lambda[1] = d3; vec[0] = c; vec[1] = -s; vec[2] = s; vec[3] = c; } else { lambda[0] = d3; lambda[1] = d0; vec[0] = s; vec[1] = c; vec[2] = c; vec[3] = -s; } } /// Compute the spectrum of the matrix of size 3 with given @a data, returning /// the eigenvalues in the array @a lambda and the eigenvectors in the array @a /// vec (listed consecutively). template<> MFEM_HOST_DEVICE inline void CalcEigenvalues<3>(const double *data, double *lambda, double *vec) { double d11 = data[0]; double d12 = data[3]; // use the upper triangular entries double d22 = data[4]; double d13 = data[6]; double d23 = data[7]; double d33 = data[8]; double mult; { double d_max = fabs(d11); if (d_max < fabs(d22)) { d_max = fabs(d22); } if (d_max < fabs(d33)) { d_max = fabs(d33); } if (d_max < fabs(d12)) { d_max = fabs(d12); } if (d_max < fabs(d13)) { d_max = fabs(d13); } if (d_max < fabs(d23)) { d_max = fabs(d23); } internal::GetScalingFactor(d_max, mult); } d11 /= mult; d22 /= mult; d33 /= mult; d12 /= mult; d13 /= mult; d23 /= mult; double aa = (d11 + d22 + d33)/3; // aa = tr(A)/3 double c1 = d11 - aa; double c2 = d22 - aa; double c3 = d33 - aa; double Q, R; Q = (2*(d12*d12 + d13*d13 + d23*d23) + c1*c1 + c2*c2 + c3*c3)/6; R = (c1*(d23*d23 - c2*c3)+ d12*(d12*c3 - 2*d13*d23) + d13*d13*c2)/2; if (Q <= 0.) { lambda[0] = lambda[1] = lambda[2] = aa; vec[0] = 1.; vec[3] = 0.; vec[6] = 0.; vec[1] = 0.; vec[4] = 1.; vec[7] = 0.; vec[2] = 0.; vec[5] = 0.; vec[8] = 1.; } else { double sqrtQ = sqrt(Q); double sqrtQ3 = Q*sqrtQ; // double sqrtQ3 = sqrtQ*sqrtQ*sqrtQ; // double sqrtQ3 = pow(Q, 1.5); double r; if (fabs(R) >= sqrtQ3) { if (R < 0.) { // R = -1.; r = 2*sqrtQ; } else { // R = 1.; r = -2*sqrtQ; } } else { R = R/sqrtQ3; if (R < 0.) { r = -2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3); // max } else { r = -2*sqrtQ*cos(acos(R)/3); // min } } aa += r; c1 = d11 - aa; c2 = d22 - aa; c3 = d33 - aa; // Type of Householder reflections: z --> mu ek, where k is the index // of the entry in z with: // mode == 0: smallest absolute value --> angle closest to pi/2 // mode == 1: largest absolute value --> angle farthest from pi/2 // Observations: // mode == 0 produces better eigenvectors, less accurate eigenvalues? // mode == 1 produces better eigenvalues, less accurate eigenvectors? const int mode = 0; // Find a unit vector z = (z1,z2,z3) in the "near"-kernel of // | c1 d12 d13 | // | d12 c2 d23 | = A - aa*I // | d13 d23 c3 | // This vector is also an eigenvector for A corresponding to aa. // The vector z overwrites (c1,c2,c3). switch (internal::KernelVector3S(mode, d12, d13, d23, c1, c2, c3)) { case 3: // 'aa' is a triple eigenvalue lambda[0] = lambda[1] = lambda[2] = aa; vec[0] = 1.; vec[3] = 0.; vec[6] = 0.; vec[1] = 0.; vec[4] = 1.; vec[7] = 0.; vec[2] = 0.; vec[5] = 0.; vec[8] = 1.; goto done_3d; case 2: // ok, continue with the returned vector orthogonal to the kernel case 1: // ok, continue with the returned vector in the "near"-kernel ; } // Using the eigenvector c=(c1,c2,c3) transform A into // | d11 0 0 | // A <-- Q P A P Q = | 0 d22 d23 | // | 0 d23 d33 | double v1, v2, v3, g; int k = internal::Reduce3S(mode, d11, d22, d33, d12, d13, d23, c1, c2, c3, v1, v2, v3, g); // Q = I - 2 v v^t // P - permutation matrix switching entries 1 and k // find the eigenvalues and eigenvectors for // | d22 d23 | // | d23 d33 | double c, s; internal::Eigensystem2S(d23, d22, d33, c, s); // d22 <-> P Q (0, c, -s), d33 <-> P Q (0, s, c) double *vec_1, *vec_2, *vec_3; if (d11 <= d22) { if (d22 <= d33) { lambda[0] = d11; vec_1 = vec; lambda[1] = d22; vec_2 = vec + 3; lambda[2] = d33; vec_3 = vec + 6; } else if (d11 <= d33) { lambda[0] = d11; vec_1 = vec; lambda[1] = d33; vec_3 = vec + 3; lambda[2] = d22; vec_2 = vec + 6; } else { lambda[0] = d33; vec_3 = vec; lambda[1] = d11; vec_1 = vec + 3; lambda[2] = d22; vec_2 = vec + 6; } } else { if (d11 <= d33) { lambda[0] = d22; vec_2 = vec; lambda[1] = d11; vec_1 = vec + 3; lambda[2] = d33; vec_3 = vec + 6; } else if (d22 <= d33) { lambda[0] = d22; vec_2 = vec; lambda[1] = d33; vec_3 = vec + 3; lambda[2] = d11; vec_1 = vec + 6; } else { lambda[0] = d33; vec_3 = vec; lambda[1] = d22; vec_2 = vec + 3; lambda[2] = d11; vec_1 = vec + 6; } } vec_1[0] = c1; vec_1[1] = c2; vec_1[2] = c3; d22 = g*(v2*c - v3*s); d33 = g*(v2*s + v3*c); vec_2[0] = - v1*d22; vec_3[0] = - v1*d33; vec_2[1] = c - v2*d22; vec_3[1] = s - v2*d33; vec_2[2] = -s - v3*d22; vec_3[2] = c - v3*d33; switch (k) { case 2: internal::Swap(vec_2[0], vec_2[1]); internal::Swap(vec_3[0], vec_3[1]); break; case 3: internal::Swap(vec_2[0], vec_2[2]); internal::Swap(vec_3[0], vec_3[2]); } } done_3d: lambda[0] *= mult; lambda[1] *= mult; lambda[2] *= mult; } /// Return the i'th singular value of the matrix of size 2 with given @a data. template<> MFEM_HOST_DEVICE inline double CalcSingularvalue<2>(const double *data, const int i) { double d0, d1, d2, d3; d0 = data[0]; d1 = data[1]; d2 = data[2]; d3 = data[3]; double mult; { double d_max = fabs(d0); if (d_max < fabs(d1)) { d_max = fabs(d1); } if (d_max < fabs(d2)) { d_max = fabs(d2); } if (d_max < fabs(d3)) { d_max = fabs(d3); } internal::GetScalingFactor(d_max, mult); } d0 /= mult; d1 /= mult; d2 /= mult; d3 /= mult; double t = 0.5*((d0+d2)*(d0-d2)+(d1-d3)*(d1+d3)); double s = d0*d2 + d1*d3; s = sqrt(0.5*(d0*d0 + d1*d1 + d2*d2 + d3*d3) + sqrt(t*t + s*s)); if (s == 0.0) { return 0.0; } t = fabs(d0*d3 - d1*d2) / s; if (t > s) { if (i == 0) { return t*mult; } return s*mult; } if (i == 0) { return s*mult; } return t*mult; } /// Return the i'th singular value of the matrix of size 3 with given @a data. template<> MFEM_HOST_DEVICE inline double CalcSingularvalue<3>(const double *data, const int i) { double d0, d1, d2, d3, d4, d5, d6, d7, d8; d0 = data[0]; d3 = data[3]; d6 = data[6]; d1 = data[1]; d4 = data[4]; d7 = data[7]; d2 = data[2]; d5 = data[5]; d8 = data[8]; double mult; { double d_max = fabs(d0); if (d_max < fabs(d1)) { d_max = fabs(d1); } if (d_max < fabs(d2)) { d_max = fabs(d2); } if (d_max < fabs(d3)) { d_max = fabs(d3); } if (d_max < fabs(d4)) { d_max = fabs(d4); } if (d_max < fabs(d5)) { d_max = fabs(d5); } if (d_max < fabs(d6)) { d_max = fabs(d6); } if (d_max < fabs(d7)) { d_max = fabs(d7); } if (d_max < fabs(d8)) { d_max = fabs(d8); } internal::GetScalingFactor(d_max, mult); } d0 /= mult; d1 /= mult; d2 /= mult; d3 /= mult; d4 /= mult; d5 /= mult; d6 /= mult; d7 /= mult; d8 /= mult; double b11 = d0*d0 + d1*d1 + d2*d2; double b12 = d0*d3 + d1*d4 + d2*d5; double b13 = d0*d6 + d1*d7 + d2*d8; double b22 = d3*d3 + d4*d4 + d5*d5; double b23 = d3*d6 + d4*d7 + d5*d8; double b33 = d6*d6 + d7*d7 + d8*d8; // double a, b, c; // a = -(b11 + b22 + b33); // b = b11*(b22 + b33) + b22*b33 - b12*b12 - b13*b13 - b23*b23; // c = b11*(b23*b23 - b22*b33) + b12*(b12*b33 - 2*b13*b23) + b13*b13*b22; // double Q = (a * a - 3 * b) / 9; // double Q = (b12*b12 + b13*b13 + b23*b23 + // ((b11 - b22)*(b11 - b22) + // (b11 - b33)*(b11 - b33) + // (b22 - b33)*(b22 - b33))/6)/3; // Q = (3*(b12^2 + b13^2 + b23^2) + // ((b11 - b22)^2 + (b11 - b33)^2 + (b22 - b33)^2)/2)/9 // or // Q = (1/6)*|B-tr(B)/3|_F^2 // Q >= 0 and // Q = 0 <==> B = scalar * I // double R = (2 * a * a * a - 9 * a * b + 27 * c) / 54; double aa = (b11 + b22 + b33)/3; // aa = tr(B)/3 double c1, c2, c3; // c1 = b11 - aa; // ((b11 - b22) + (b11 - b33))/3 // c2 = b22 - aa; // ((b22 - b11) + (b22 - b33))/3 // c3 = b33 - aa; // ((b33 - b11) + (b33 - b22))/3 { double b11_b22 = ((d0-d3)*(d0+d3)+(d1-d4)*(d1+d4)+(d2-d5)*(d2+d5)); double b22_b33 = ((d3-d6)*(d3+d6)+(d4-d7)*(d4+d7)+(d5-d8)*(d5+d8)); double b33_b11 = ((d6-d0)*(d6+d0)+(d7-d1)*(d7+d1)+(d8-d2)*(d8+d2)); c1 = (b11_b22 - b33_b11)/3; c2 = (b22_b33 - b11_b22)/3; c3 = (b33_b11 - b22_b33)/3; } double Q, R; Q = (2*(b12*b12 + b13*b13 + b23*b23) + c1*c1 + c2*c2 + c3*c3)/6; R = (c1*(b23*b23 - c2*c3)+ b12*(b12*c3 - 2*b13*b23) +b13*b13*c2)/2; // R = (-1/2)*det(B-(tr(B)/3)*I) // Note: 54*(det(S))^2 <= |S|_F^6, when S^t=S and tr(S)=0, S is 3x3 // Therefore: R^2 <= Q^3 if (Q <= 0.) { ; } // else if (fabs(R) >= sqrtQ3) // { // double det = (d[0] * (d[4] * d[8] - d[5] * d[7]) + // d[3] * (d[2] * d[7] - d[1] * d[8]) + // d[6] * (d[1] * d[5] - d[2] * d[4])); // // if (R > 0.) // { // if (i == 2) // // aa -= 2*sqrtQ; // return fabs(det)/(aa + sqrtQ); // else // aa += sqrtQ; // } // else // { // if (i != 0) // aa -= sqrtQ; // // aa = fabs(det)/sqrt(aa + 2*sqrtQ); // else // aa += 2*sqrtQ; // } // } else { double sqrtQ = sqrt(Q); double sqrtQ3 = Q*sqrtQ; // double sqrtQ3 = sqrtQ*sqrtQ*sqrtQ; // double sqrtQ3 = pow(Q, 1.5); double r; if (fabs(R) >= sqrtQ3) { if (R < 0.) { // R = -1.; r = 2*sqrtQ; } else { // R = 1.; r = -2*sqrtQ; } } else { R = R/sqrtQ3; // if (fabs(R) <= 0.95) if (fabs(R) <= 0.9) { if (i == 2) { aa -= 2*sqrtQ*cos(acos(R)/3); // min } else if (i == 0) { aa -= 2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3); // max } else { aa -= 2*sqrtQ*cos((acos(R) - 2.0*M_PI)/3); // mid } goto have_aa; } if (R < 0.) { r = -2*sqrtQ*cos((acos(R) + 2.0*M_PI)/3); // max if (i == 0) { aa += r; goto have_aa; } } else { r = -2*sqrtQ*cos(acos(R)/3); // min if (i == 2) { aa += r; goto have_aa; } } } // (tr(B)/3 + r) is the root which is separated from the other // two roots which are close to each other when |R| is close to 1 c1 -= r; c2 -= r; c3 -= r; // aa += r; // Type of Householder reflections: z --> mu ek, where k is the index // of the entry in z with: // mode == 0: smallest absolute value --> angle closest to pi/2 // (eliminate large entries) // mode == 1: largest absolute value --> angle farthest from pi/2 // (eliminate small entries) const int mode = 1; // Find a unit vector z = (z1,z2,z3) in the "near"-kernel of // | c1 b12 b13 | // | b12 c2 b23 | = B - aa*I // | b13 b23 c3 | // This vector is also an eigenvector for B corresponding to aa // The vector z overwrites (c1,c2,c3). switch (internal::KernelVector3S(mode, b12, b13, b23, c1, c2, c3)) { case 3: aa += r; goto have_aa; case 2: // ok, continue with the returned vector orthogonal to the kernel case 1: // ok, continue with the returned vector in the "near"-kernel ; } // Using the eigenvector c = (c1,c2,c3) to transform B into // | b11 0 0 | // B <-- Q P B P Q = | 0 b22 b23 | // | 0 b23 b33 | double v1, v2, v3, g; internal::Reduce3S(mode, b11, b22, b33, b12, b13, b23, c1, c2, c3, v1, v2, v3, g); // Q = I - g v v^t // P - permutation matrix switching rows and columns 1 and k // find the eigenvalues of // | b22 b23 | // | b23 b33 | internal::Eigenvalues2S(b23, b22, b33); if (i == 2) { aa = fmin(fmin(b11, b22), b33); } else if (i == 1) { if (b11 <= b22) { aa = (b22 <= b33) ? b22 : fmax(b11, b33); } else { aa = (b11 <= b33) ? b11 : fmax(b33, b22); } } else { aa = fmax(fmax(b11, b22), b33); } } have_aa: return sqrt(fabs(aa))*mult; // take abs before we sort? } /// Assuming L.U = P.A for a factored matrix (m x m), // compute x <- A x // // @param [in] data LU factorization of A // @param [in] m square matrix height // @param [in] ipiv array storing pivot information // @param [in, out] x vector storing right-hand side and then solution MFEM_HOST_DEVICE inline void LUSolve(const double *data, const int m, const int *ipiv, double *x) { // X <- P X for (int i = 0; i < m; i++) { internal::Swap(x[i], x[ipiv[i]]); } // X <- L^{-1} X for (int j = 0; j < m; j++) { const double x_j = x[j]; for (int i = j + 1; i < m; i++) { x[i] -= data[i + j * m] * x_j; } } // X <- U^{-1} X for (int j = m - 1; j >= 0; j--) { const double x_j = (x[j] /= data[j + j * m]); for (int i = 0; i < j; i++) { x[i] -= data[i + j * m] * x_j; } } } } // namespace kernels } // namespace mfem #endif // MFEM_LINALG_KERNELS_HPP