#include "ParSparseMatrix.h" #include #ifdef TIMING_ON double time_diff(timespec &start, timespec &end) { return (double)(1e9 * (end.tv_sec - start.tv_sec) + end.tv_nsec - start.tv_nsec) / 1e9; } #endif using namespace MPI_Wrappers; using namespace std; namespace ATC_matrix { // All the same constructors as for SparseMatrix ParSparseMatrix::ParSparseMatrix(MPI_Comm comm, INDEX rows, INDEX cols) : SparseMatrix(rows, cols), _comm(comm) {} ParSparseMatrix::ParSparseMatrix(MPI_Comm comm, const SparseMatrix &c) : SparseMatrix(c), _comm(comm) {} ParSparseMatrix::ParSparseMatrix(MPI_Comm comm, INDEX* rows, INDEX* cols, double* vals, INDEX size, INDEX nRows, INDEX nCols, INDEX nRowsCRS) : SparseMatrix(rows, cols, vals, size, nRows, nCols, nRowsCRS), _comm(comm) {} //============================================================ void ParSparseMatrix::MultMv(const Vector& v, DenseVector& c) const { int numProcs = MPI_Wrappers::size(_comm); #ifdef DISABLE_PAR_HEURISTICS // Use much more lenient heuristics to exercise parallel code if (numProcs == 1 || _size < 300) { #else // These are simple heuristics to perform multiplication in serial if // parallel will be slower. They were determined experimentally. if ( numProcs == 1 || (_size < 50000 || _size > 10000000) || ((_size < 150000 || _size > 5000000) && numProcs > 8) || ((_size < 500000 || _size > 2500000) && numProcs > 16 ) || (numProcs > 32)) { #endif SparseMatrix::MultMv(v, c); return; } SparseMatrix::compress(*this); GCK(*this, v, this->nCols() != v.size(), "ParSparseMatrix * Vector") SparseMatrix A_local; // Split the sparse matrix. partition() takes a ParSparMat, so we cast. partition(*static_cast*>(&A_local)); // actually do multiplication - end up with partial result vector // on each processor #ifdef TIMING_ON timespec before, after; // barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &before); #endif DenseVector c_local = A_local * v; #ifdef TIMING_ON // barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &after); cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " mat.vec time\n"; //LammpsInterface::instance()->all_print((after-before),"mat.vec time"); barrier(MPI_COMM_WORLD); #endif // destroy A_local intelligently static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. #ifdef TIMING_ON barrier(MPI_COMM_WORLD); //barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &before); #endif allsum(_comm, c_local.ptr(), c.ptr(), c_local.size()); #ifdef TIMING_ON //barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &after); cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(before,after) << " allsum time\n"; //LammpsInterface::instance()->print_msg_once((after-before),"allsum time"); #endif } DenseVector ParSparseMatrix::transMat( const Vector& v) const { SparseMatrix::compress(*this); GCK(*this, v, this->nRows() != v.size(), "ParSparseMatrix transpose * Vector") DenseVector c(nCols(), true); SparseMatrix A_local; partition(*static_cast*>(&A_local)); // actually do multiplication - end up with partial result vector // on each processor DenseVector c_local = A_local.transMat(v); static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. allsum(_comm, c_local.ptr(), c.ptr(), c_local.size()); return c; } void ParSparseMatrix::MultAB(const Matrix& B, DenseMatrix& C) const { SparseMatrix::compress(*this); GCK(*this, B, this->nCols() != B.nRows(), "ParSparseMatrix * Matrix") SparseMatrix A_local; partition(*static_cast*>(&A_local)); // actually do multiplication - end up with partial result matrix // on each processor #ifdef TIMING_ON timespec before, after; barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &before); #endif DenseMatrix C_local = A_local * B; #ifdef TIMING_ON barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &after); cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(after,before) << " mat.vec time\n"; //LammpsInterface::instance()->all_print((after-before),"mat.vec time"); #endif static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. #ifdef TIMING_ON barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &before); #endif allsum(_comm, C_local.ptr(), C.ptr(), C_local.size()); #ifdef TIMING_ON barrier(MPI_COMM_WORLD); clock_gettime(CLOCK_MONOTONIC, &after); cout << "P" << MPI_Wrappers::rank(MPI_COMM_WORLD) << " " << time_diff(after,before) << " allsum time\n"; //LammpsInterface::instance()->print_msg_once((after-before),"allsum time"); #endif } DenseMatrix ParSparseMatrix::transMat( const DenseMatrix& B) const { SparseMatrix::compress(*this); GCK(*this, B, this->nRows() != B.nRows(), "ParSparseMatrix transpose * Matrix") DenseMatrix C(nCols(), B.nCols(), true); SparseMatrix A_local; partition(*static_cast*>(&A_local)); // actually do multiplication - end up with partial result matrix // on each processor DenseMatrix C_local = A_local.transMat(B); static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. allsum(_comm, C_local.ptr(), C.ptr(), C_local.size()); return C; } /* The two commented-out functions both need to return SparseMatrices. It's hard to combine sparse matrices between processors, so this has not yet been completed. void ParMultAB(const SparseMatrix &B, SparseMatrix &C) const { //SparseMatrix::compress(*this); GCK(*this, B, this->nCols()!=B.nRows(), "ParSparseMatrix * SparseMatrix") ParSparseMatrix A_local(this->_comm); this->partition(A_local); // actually do multiplication - end up with partial result matrix // on each processor SparseMatrix C_local = ((SparseMatrix)A_local) * B; // destroy newA intelligently static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. sumSparse(C_local, C); }*/ DenseMatrix ParSparseMatrix::transMat( const SparseMatrix& B) const { SparseMatrix::compress(*this); GCK(*this, B, this->nRows() != B.nRows(), "ParSparseMatrix transpose * SparseMatrix") DenseMatrix C(nCols(), B.nCols(), true); SparseMatrix A_local; partition(*static_cast*>(&A_local)); // actually do multiplication - end up with partial result matrix // on each processor DenseMatrix C_local = A_local.transMat(B); static_cast*>(&A_local)->finalize(); // Add all the result vectors together on each processor. allsum(_comm, C_local.ptr(), C.ptr(), C_local.size()); return C; } /*void ParMultAB(const DiagonalMatrix &B, SparseMatrix &C) const { //SparseMatrix::compress(*this); GCK(*this, B, this->nCols()!=B.nRows(), "ParSparseMatrix * DiagonalMatrix") ParSparseMatrix A_local(this->_comm); this->partition(A_local); // actually do multiplication - end up with partial result matrix // on each processor SparseMatrix C_local = ((SparseMatrix)A_local) * B; // destroy newA intelligently A_local._val = NULL; A_local._ja = NULL; // Add all the result vectors together on each processor. sumSparse(C_local, C); }*/ void ParSparseMatrix::partition( ParSparseMatrix& A_local) const { // create new sparse matrix on each processor, with same size and // a disjoint subset of A's elements. // // Ex: on two processors, // // |0 1 0| |0 1 0| |0 0 0| // |2 6 0| splits into |2 0 0| on proc 1 and |0 6 0| on proc 2 // |0 0 3| |0 0 0| |0 0 3| // // We compute the subproducts individually on each processor, then // sum up all the vectors to get our final result. // // decide which elements will be in each submatrix INDEX startIndex = (MPI_Wrappers::rank(_comm) * size()) / MPI_Wrappers::size(_comm); INDEX endIndex = ((MPI_Wrappers::rank(_comm) + 1) * size()) / MPI_Wrappers::size(_comm); // update number of elements A_local._nRows = _nRows; A_local._nCols = _nCols; A_local._size = endIndex - startIndex; A_local._nRowsCRS = _nRowsCRS; // use pointer arithmetic to: // set newA's _val (to inside A's _val) A_local._val = _val + startIndex; // set newA's _ja (to inside A's _ja) A_local._ja = _ja + startIndex; // set newA's _ia (from scratch) A_local._ia = new INDEX[nRowsCRS() + 1]; INDEX numRows = nRowsCRS(); if (A_local._size > 0) { for (INDEX i = 0; i < numRows + 1; i++) { A_local._ia[i] = std::min(std::max((_ia[i] - startIndex), 0), endIndex - startIndex); } } else { A_local._nRowsCRS = 0; } } // Prepare an A_local matrix for deletion after it has been loaded with // data members from another matrix. void ParSparseMatrix::finalize() { _val = NULL; _ja = NULL; } void ParSparseMatrix::operator=(const SparseMatrix &source) { copy(source); } /*void sumSparse(SparseMatrix &C_local, SparseMatrix &C) { }*/ }