// Ceres Solver - A fast non-linear least squares minimizer // Copyright 2023 Google Inc. All rights reserved. // http://ceres-solver.org/ // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimer. // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // * Neither the name of Google Inc. nor the names of its contributors may be // used to endorse or promote products derived from this software without // specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE // POSSIBILITY OF SUCH DAMAGE. // // Author: sameeragarwal@google.com (Sameer Agarwal) #include "ceres/dense_cholesky.h" #include #include #include #include #include #include "ceres/internal/config.h" #include "ceres/iterative_refiner.h" #ifndef CERES_NO_CUDA #include "ceres/context_impl.h" #include "ceres/cuda_kernels_vector_ops.h" #include "cuda_runtime.h" #include "cusolverDn.h" #endif // CERES_NO_CUDA #ifndef CERES_NO_LAPACK // C interface to the LAPACK Cholesky factorization and triangular solve. extern "C" void dpotrf_( const char* uplo, const int* n, double* a, const int* lda, int* info); extern "C" void dpotrs_(const char* uplo, const int* n, const int* nrhs, const double* a, const int* lda, double* b, const int* ldb, int* info); extern "C" void spotrf_( const char* uplo, const int* n, float* a, const int* lda, int* info); extern "C" void spotrs_(const char* uplo, const int* n, const int* nrhs, const float* a, const int* lda, float* b, const int* ldb, int* info); #endif namespace ceres::internal { DenseCholesky::~DenseCholesky() = default; std::unique_ptr DenseCholesky::Create( const LinearSolver::Options& options) { std::unique_ptr dense_cholesky; switch (options.dense_linear_algebra_library_type) { case EIGEN: // Eigen mixed precision solver not yet implemented. if (options.use_mixed_precision_solves) { dense_cholesky = std::make_unique(); } else { dense_cholesky = std::make_unique(); } break; case LAPACK: #ifndef CERES_NO_LAPACK // LAPACK mixed precision solver not yet implemented. if (options.use_mixed_precision_solves) { dense_cholesky = std::make_unique(); } else { dense_cholesky = std::make_unique(); } break; #else LOG(FATAL) << "Ceres was compiled without support for LAPACK."; #endif case CUDA: #ifndef CERES_NO_CUDA if (options.use_mixed_precision_solves) { dense_cholesky = CUDADenseCholeskyMixedPrecision::Create(options); } else { dense_cholesky = CUDADenseCholesky::Create(options); } break; #else LOG(FATAL) << "Ceres was compiled without support for CUDA."; #endif default: LOG(FATAL) << "Unknown dense linear algebra library type : " << DenseLinearAlgebraLibraryTypeToString( options.dense_linear_algebra_library_type); } if (options.max_num_refinement_iterations > 0) { auto refiner = std::make_unique( options.max_num_refinement_iterations); dense_cholesky = std::make_unique( std::move(dense_cholesky), std::move(refiner)); } return dense_cholesky; } LinearSolverTerminationType DenseCholesky::FactorAndSolve( int num_cols, double* lhs, const double* rhs, double* solution, std::string* message) { LinearSolverTerminationType termination_type = Factorize(num_cols, lhs, message); if (termination_type == LinearSolverTerminationType::SUCCESS) { termination_type = Solve(rhs, solution, message); } return termination_type; } LinearSolverTerminationType EigenDenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { Eigen::Map m(lhs, num_cols, num_cols); llt_ = std::make_unique(m); if (llt_->info() != Eigen::Success) { *message = "Eigen failure. Unable to perform dense Cholesky factorization."; return LinearSolverTerminationType::FAILURE; } *message = "Success."; return LinearSolverTerminationType::SUCCESS; } LinearSolverTerminationType EigenDenseCholesky::Solve(const double* rhs, double* solution, std::string* message) { if (llt_->info() != Eigen::Success) { *message = "Eigen failure. Unable to perform dense Cholesky factorization."; return LinearSolverTerminationType::FAILURE; } VectorRef(solution, llt_->cols()) = llt_->solve(ConstVectorRef(rhs, llt_->cols())); *message = "Success."; return LinearSolverTerminationType::SUCCESS; } LinearSolverTerminationType FloatEigenDenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { // TODO(sameeragarwal): Check if this causes a double allocation. lhs_ = Eigen::Map(lhs, num_cols, num_cols).cast(); llt_ = std::make_unique(lhs_); if (llt_->info() != Eigen::Success) { *message = "Eigen failure. Unable to perform dense Cholesky factorization."; return LinearSolverTerminationType::FAILURE; } *message = "Success."; return LinearSolverTerminationType::SUCCESS; } LinearSolverTerminationType FloatEigenDenseCholesky::Solve( const double* rhs, double* solution, std::string* message) { if (llt_->info() != Eigen::Success) { *message = "Eigen failure. Unable to perform dense Cholesky factorization."; return LinearSolverTerminationType::FAILURE; } rhs_ = ConstVectorRef(rhs, llt_->cols()).cast(); solution_ = llt_->solve(rhs_); VectorRef(solution, llt_->cols()) = solution_.cast(); *message = "Success."; return LinearSolverTerminationType::SUCCESS; } #ifndef CERES_NO_LAPACK LinearSolverTerminationType LAPACKDenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { lhs_ = lhs; num_cols_ = num_cols; const char uplo = 'L'; int info = 0; dpotrf_(&uplo, &num_cols_, lhs_, &num_cols_, &info); if (info < 0) { termination_type_ = LinearSolverTerminationType::FATAL_ERROR; LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it. " << "LAPACK::dpotrf fatal error. " << "Argument: " << -info << " is invalid."; } else if (info > 0) { termination_type_ = LinearSolverTerminationType::FAILURE; *message = StringPrintf( "LAPACK::dpotrf numerical failure. " "The leading minor of order %d is not positive definite.", info); } else { termination_type_ = LinearSolverTerminationType::SUCCESS; *message = "Success."; } return termination_type_; } LinearSolverTerminationType LAPACKDenseCholesky::Solve(const double* rhs, double* solution, std::string* message) { const char uplo = 'L'; const int nrhs = 1; int info = 0; VectorRef(solution, num_cols_) = ConstVectorRef(rhs, num_cols_); dpotrs_( &uplo, &num_cols_, &nrhs, lhs_, &num_cols_, solution, &num_cols_, &info); if (info < 0) { termination_type_ = LinearSolverTerminationType::FATAL_ERROR; LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it. " << "LAPACK::dpotrs fatal error. " << "Argument: " << -info << " is invalid."; } *message = "Success"; termination_type_ = LinearSolverTerminationType::SUCCESS; return termination_type_; } LinearSolverTerminationType FloatLAPACKDenseCholesky::Factorize( int num_cols, double* lhs, std::string* message) { num_cols_ = num_cols; lhs_ = Eigen::Map(lhs, num_cols, num_cols).cast(); const char uplo = 'L'; int info = 0; spotrf_(&uplo, &num_cols_, lhs_.data(), &num_cols_, &info); if (info < 0) { termination_type_ = LinearSolverTerminationType::FATAL_ERROR; LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it. " << "LAPACK::spotrf fatal error. " << "Argument: " << -info << " is invalid."; } else if (info > 0) { termination_type_ = LinearSolverTerminationType::FAILURE; *message = StringPrintf( "LAPACK::spotrf numerical failure. " "The leading minor of order %d is not positive definite.", info); } else { termination_type_ = LinearSolverTerminationType::SUCCESS; *message = "Success."; } return termination_type_; } LinearSolverTerminationType FloatLAPACKDenseCholesky::Solve( const double* rhs, double* solution, std::string* message) { const char uplo = 'L'; const int nrhs = 1; int info = 0; rhs_and_solution_ = ConstVectorRef(rhs, num_cols_).cast(); spotrs_(&uplo, &num_cols_, &nrhs, lhs_.data(), &num_cols_, rhs_and_solution_.data(), &num_cols_, &info); if (info < 0) { termination_type_ = LinearSolverTerminationType::FATAL_ERROR; LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it. " << "LAPACK::dpotrs fatal error. " << "Argument: " << -info << " is invalid."; } *message = "Success"; termination_type_ = LinearSolverTerminationType::SUCCESS; VectorRef(solution, num_cols_) = rhs_and_solution_.head(num_cols_).cast(); return termination_type_; } #endif // CERES_NO_LAPACK RefinedDenseCholesky::RefinedDenseCholesky( std::unique_ptr dense_cholesky, std::unique_ptr iterative_refiner) : dense_cholesky_(std::move(dense_cholesky)), iterative_refiner_(std::move(iterative_refiner)) {} RefinedDenseCholesky::~RefinedDenseCholesky() = default; LinearSolverTerminationType RefinedDenseCholesky::Factorize( const int num_cols, double* lhs, std::string* message) { lhs_ = lhs; num_cols_ = num_cols; return dense_cholesky_->Factorize(num_cols, lhs, message); } LinearSolverTerminationType RefinedDenseCholesky::Solve(const double* rhs, double* solution, std::string* message) { CHECK(lhs_ != nullptr); auto termination_type = dense_cholesky_->Solve(rhs, solution, message); if (termination_type != LinearSolverTerminationType::SUCCESS) { return termination_type; } iterative_refiner_->Refine( num_cols_, lhs_, rhs, dense_cholesky_.get(), solution); return LinearSolverTerminationType::SUCCESS; } #ifndef CERES_NO_CUDA CUDADenseCholesky::CUDADenseCholesky(ContextImpl* context) : context_(context), lhs_{context}, rhs_{context}, device_workspace_{context}, error_(context, 1) {} LinearSolverTerminationType CUDADenseCholesky::Factorize(int num_cols, double* lhs, std::string* message) { factorize_result_ = LinearSolverTerminationType::FATAL_ERROR; lhs_.Reserve(num_cols * num_cols); num_cols_ = num_cols; lhs_.CopyFromCpu(lhs, num_cols * num_cols); int device_workspace_size = 0; if (cusolverDnDpotrf_bufferSize(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols, lhs_.data(), num_cols, &device_workspace_size) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDpotrf_bufferSize failed."; return LinearSolverTerminationType::FATAL_ERROR; } device_workspace_.Reserve(device_workspace_size); if (cusolverDnDpotrf(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols, lhs_.data(), num_cols, reinterpret_cast(device_workspace_.data()), device_workspace_.size(), error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDpotrf failed."; return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " << "cuSolverDN::cusolverDnXpotrf fatal error. " << "Argument: " << -error << " is invalid."; // The following line is unreachable, but return failure just to be // pedantic, since the compiler does not know that. return LinearSolverTerminationType::FATAL_ERROR; } else if (error > 0) { *message = StringPrintf( "cuSolverDN::cusolverDnDpotrf numerical failure. " "The leading minor of order %d is not positive definite.", error); factorize_result_ = LinearSolverTerminationType::FAILURE; return LinearSolverTerminationType::FAILURE; } *message = "Success"; factorize_result_ = LinearSolverTerminationType::SUCCESS; return LinearSolverTerminationType::SUCCESS; } LinearSolverTerminationType CUDADenseCholesky::Solve(const double* rhs, double* solution, std::string* message) { if (factorize_result_ != LinearSolverTerminationType::SUCCESS) { *message = "Factorize did not complete successfully previously."; return factorize_result_; } rhs_.CopyFromCpu(rhs, num_cols_); if (cusolverDnDpotrs(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, 1, lhs_.data(), num_cols_, rhs_.data(), num_cols_, error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDpotrs failed."; return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; error_.CopyToCpu(&error, 1); if (error != 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." << "cuSolverDN::cusolverDnDpotrs fatal error. " << "Argument: " << -error << " is invalid."; } rhs_.CopyToCpu(solution, num_cols_); *message = "Success"; return LinearSolverTerminationType::SUCCESS; } std::unique_ptr CUDADenseCholesky::Create( const LinearSolver::Options& options) { if (options.dense_linear_algebra_library_type != CUDA || options.context == nullptr || !options.context->IsCudaInitialized()) { return nullptr; } return std::unique_ptr( new CUDADenseCholesky(options.context)); } std::unique_ptr CUDADenseCholeskyMixedPrecision::Create(const LinearSolver::Options& options) { if (options.dense_linear_algebra_library_type != CUDA || !options.use_mixed_precision_solves || options.context == nullptr || !options.context->IsCudaInitialized()) { return nullptr; } return std::unique_ptr( new CUDADenseCholeskyMixedPrecision( options.context, options.max_num_refinement_iterations)); } LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskyFactorize(std::string* message) { int device_workspace_size = 0; if (cusolverDnSpotrf_bufferSize(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, lhs_fp32_.data(), num_cols_, &device_workspace_size) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnSpotrf_bufferSize failed."; return LinearSolverTerminationType::FATAL_ERROR; } device_workspace_.Reserve(device_workspace_size); if (cusolverDnSpotrf(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, lhs_fp32_.data(), num_cols_, device_workspace_.data(), device_workspace_.size(), error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnSpotrf failed."; return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; error_.CopyToCpu(&error, 1); if (error < 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres - " << "please report it. " << "cuSolverDN::cusolverDnSpotrf fatal error. " << "Argument: " << -error << " is invalid."; // The following line is unreachable, but return failure just to be // pedantic, since the compiler does not know that. return LinearSolverTerminationType::FATAL_ERROR; } if (error > 0) { *message = StringPrintf( "cuSolverDN::cusolverDnSpotrf numerical failure. " "The leading minor of order %d is not positive definite.", error); factorize_result_ = LinearSolverTerminationType::FAILURE; return LinearSolverTerminationType::FAILURE; } *message = "Success"; return LinearSolverTerminationType::SUCCESS; } LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::CudaCholeskySolve( std::string* message) { CHECK_EQ(cudaMemcpyAsync(correction_fp32_.data(), residual_fp32_.data(), num_cols_ * sizeof(float), cudaMemcpyDeviceToDevice, context_->DefaultStream()), cudaSuccess); if (cusolverDnSpotrs(context_->cusolver_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, 1, lhs_fp32_.data(), num_cols_, correction_fp32_.data(), num_cols_, error_.data()) != CUSOLVER_STATUS_SUCCESS) { *message = "cuSolverDN::cusolverDnDpotrs failed."; return LinearSolverTerminationType::FATAL_ERROR; } int error = 0; error_.CopyToCpu(&error, 1); if (error != 0) { LOG(FATAL) << "Congratulations, you found a bug in Ceres. " << "Please report it." << "cuSolverDN::cusolverDnDpotrs fatal error. " << "Argument: " << -error << " is invalid."; } *message = "Success"; return LinearSolverTerminationType::SUCCESS; } CUDADenseCholeskyMixedPrecision::CUDADenseCholeskyMixedPrecision( ContextImpl* context, int max_num_refinement_iterations) : context_(context), lhs_fp64_{context}, rhs_fp64_{context}, lhs_fp32_{context}, device_workspace_{context}, error_(context, 1), x_fp64_{context}, correction_fp32_{context}, residual_fp32_{context}, residual_fp64_{context}, max_num_refinement_iterations_(max_num_refinement_iterations) {} LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Factorize( int num_cols, double* lhs, std::string* message) { num_cols_ = num_cols; // Copy fp64 version of lhs to GPU. lhs_fp64_.Reserve(num_cols * num_cols); lhs_fp64_.CopyFromCpu(lhs, num_cols * num_cols); // Create an fp32 copy of lhs, lhs_fp32. lhs_fp32_.Reserve(num_cols * num_cols); CudaFP64ToFP32(lhs_fp64_.data(), lhs_fp32_.data(), num_cols * num_cols, context_->DefaultStream()); // Factorize lhs_fp32. factorize_result_ = CudaCholeskyFactorize(message); return factorize_result_; } LinearSolverTerminationType CUDADenseCholeskyMixedPrecision::Solve( const double* rhs, double* solution, std::string* message) { // If factorization failed, return failure. if (factorize_result_ != LinearSolverTerminationType::SUCCESS) { *message = "Factorize did not complete successfully previously."; return factorize_result_; } // Reserve memory for all arrays. rhs_fp64_.Reserve(num_cols_); x_fp64_.Reserve(num_cols_); correction_fp32_.Reserve(num_cols_); residual_fp32_.Reserve(num_cols_); residual_fp64_.Reserve(num_cols_); // Initialize x = 0. CudaSetZeroFP64(x_fp64_.data(), num_cols_, context_->DefaultStream()); // Initialize residual = rhs. rhs_fp64_.CopyFromCpu(rhs, num_cols_); residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_); for (int i = 0; i <= max_num_refinement_iterations_; ++i) { // Cast residual from fp64 to fp32. CudaFP64ToFP32(residual_fp64_.data(), residual_fp32_.data(), num_cols_, context_->DefaultStream()); // [fp32] c = lhs^-1 * residual. auto result = CudaCholeskySolve(message); if (result != LinearSolverTerminationType::SUCCESS) { return result; } // [fp64] x += c. CudaDsxpy(x_fp64_.data(), correction_fp32_.data(), num_cols_, context_->DefaultStream()); if (i < max_num_refinement_iterations_) { // [fp64] residual = rhs - lhs * x // This is done in two steps: // 1. [fp64] residual = rhs residual_fp64_.CopyFromGPUArray(rhs_fp64_.data(), num_cols_); // 2. [fp64] residual = residual - lhs * x double alpha = -1.0; double beta = 1.0; cublasDsymv(context_->cublas_handle_, CUBLAS_FILL_MODE_LOWER, num_cols_, &alpha, lhs_fp64_.data(), num_cols_, x_fp64_.data(), 1, &beta, residual_fp64_.data(), 1); } } x_fp64_.CopyToCpu(solution, num_cols_); *message = "Success."; return LinearSolverTerminationType::SUCCESS; } #endif // CERES_NO_CUDA } // namespace ceres::internal