From 46075d305ad43c906ff7bba84ec4c6d4488e4773 Mon Sep 17 00:00:00 2001 From: Mark Hoemmen Date: Wed, 24 Aug 2016 09:02:25 -0600 Subject: [PATCH] Ifpack2: Improve LocalSparseTriangularSolver unit test @trilinos/ifpack2 Test PASSES. Add a new unit test for LocalSparseTriangularSolver. It exercises lower and upper triangular solves with a matrix whose LU factorization is known and exact (in binary floating-point arithmetic), and a resulting linear system whose solution is exact (again in binary floating-point arithmetic). Exactness means we don't need an error tolerance. (This assumes that the triangular solve is exact. It's possible to do approximate triangular solves. We ignore this case for now.) This relates to #558 and other outstanding issues. Given that LocalSparseTriangularSolver behaves correctly, we can now focus on classes that use it. For example, it would make sense to extend the "arrow matrix" test case in this new test, to include "remote" entries that Ifpack2::LocalFilter should filter out. I also instrumented Ifpack2::LocalSparseTriangularSolver with optional debug output. It takes a Teuchos::FancyOStream, which helps us get debug output in sync with unit test output. Build/Test Cases Summary Enabled Packages: Ifpack2 Disabled Packages: FEI,PyTrilinos,Moertel,STK,SEACAS,ThreadPool,OptiPack,Rythmos,Intrepid,ROL 0) MPI_DEBUG => passed: passed=29,notpassed=0 (7.80 min) 1) SERIAL_RELEASE => passed: passed=25,notpassed=0 (6.10 min) --- ...pack2_LocalSparseTriangularSolver_decl.hpp | 15 + ...fpack2_LocalSparseTriangularSolver_def.hpp | 55 ++ ...k2_UnitTestLocalSparseTriangularSolver.cpp | 890 ++++++++++++++++++ 3 files changed, 960 insertions(+) diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp index 6de32a7a5810..158a495f4a38 100644 --- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_decl.hpp @@ -45,6 +45,7 @@ #include "Ifpack2_Preconditioner.hpp" #include "Ifpack2_Details_CanChangeMatrix.hpp" +#include "Teuchos_FancyOStream.hpp" #include #ifndef DOXYGEN_SHOULD_SKIP_THIS @@ -143,6 +144,17 @@ class LocalSparseTriangularSolver : /// those properties. LocalSparseTriangularSolver (const Teuchos::RCP& A); + /// \brief Constructor that takes an optional debug output stream. + /// + /// \param A [in] The input sparse matrix. Though its type is + /// Tpetra::RowMatrix for consistency with other Ifpack2 solvers, + /// this must be a Tpetra::CrsMatrix specialization. + /// + /// \param out [in/out] Optional debug output stream. If nonnull, + /// this solver will print copious debug output to the stream. + LocalSparseTriangularSolver (const Teuchos::RCP& A, + const Teuchos::RCP& out); + //! Destructor (virtual for memory safety). virtual ~LocalSparseTriangularSolver (); @@ -296,11 +308,14 @@ class LocalSparseTriangularSolver : private: //! The original input matrix. Teuchos::RCP A_; + //! Debug output stream; may be null (not used in that case) + Teuchos::RCP out_; //! The original input matrix, as a Tpetra::CrsMatrix. Teuchos::RCP > A_crs_; + typedef Tpetra::MultiVector MV; mutable Teuchos::RCP X_colMap_; mutable Teuchos::RCP Y_rowMap_; diff --git a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp index 827d1cf68820..0d20e63f5f47 100644 --- a/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp +++ b/packages/ifpack2/src/Ifpack2_LocalSparseTriangularSolver_def.hpp @@ -74,6 +74,38 @@ LocalSparseTriangularSolver (const Teuchos::RCP& A) : } } +template +LocalSparseTriangularSolver:: +LocalSparseTriangularSolver (const Teuchos::RCP& A, + const Teuchos::RCP& out) : + A_ (A), + out_ (out), + isInitialized_ (false), + isComputed_ (false), + numInitialize_ (0), + numCompute_ (0), + numApply_ (0), + initializeTime_ (0.0), + computeTime_ (0.0), + applyTime_ (0.0) +{ + if (! out_.is_null ()) { + *out_ << ">>> DEBUG Ifpack2::LocalSparseTriangularSolver constructor" + << std::endl; + } + typedef typename Tpetra::CrsMatrix crs_matrix_type; + if (! A.is_null ()) { + Teuchos::RCP A_crs = + Teuchos::rcp_dynamic_cast (A); + TEUCHOS_TEST_FOR_EXCEPTION + (A_crs.is_null (), std::invalid_argument, + "Ifpack2::LocalSparseTriangularSolver constructor: " + "The input matrix A is not a Tpetra::CrsMatrix."); + A_crs_ = A_crs; + } +} + template LocalSparseTriangularSolver:: ~LocalSparseTriangularSolver () @@ -91,6 +123,9 @@ LocalSparseTriangularSolver:: initialize () { const char prefix[] = "Ifpack2::LocalSparseTriangularSolver::initialize: "; + if (! out_.is_null ()) { + *out_ << ">>> DEBUG " << prefix << std::endl; + } TEUCHOS_TEST_FOR_EXCEPTION (A_.is_null (), std::runtime_error, prefix << "You must call " @@ -128,6 +163,9 @@ LocalSparseTriangularSolver:: compute () { const char prefix[] = "Ifpack2::LocalSparseTriangularSolver::compute: "; + if (! out_.is_null ()) { + *out_ << ">>> DEBUG " << prefix << std::endl; + } TEUCHOS_TEST_FOR_EXCEPTION (A_.is_null (), std::runtime_error, prefix << "You must call " @@ -169,6 +207,23 @@ apply (const Tpetra::MultiVector STS; const char prefix[] = "Ifpack2::LocalSparseTriangularSolver::apply: "; + if (! out_.is_null ()) { + *out_ << ">>> DEBUG " << prefix; + if (A_crs_.is_null ()) { + *out_ << "A_crs_ is null!" << std::endl; + } + else { + const std::string uplo = A_crs_->isUpperTriangular () ? "U" : + (A_crs_->isLowerTriangular () ? "L" : "N"); + const std::string trans = (mode == Teuchos::CONJ_TRANS) ? "C" : + (mode == Teuchos::TRANS ? "T" : "N"); + const std::string diag = + (A_crs_->getNodeNumDiags () < A_crs_->getNodeNumRows ()) ? "U" : "N"; + *out_ << "uplo=\"" << uplo + << "\", trans=\"" << trans + << "\", diag=\"" << diag << "\"" << std::endl; + } + } TEUCHOS_TEST_FOR_EXCEPTION (! isComputed (), std::runtime_error, prefix << "If compute() has not yet " diff --git a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp index 935f61eda0c0..f1a5901ae160 100644 --- a/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp +++ b/packages/ifpack2/test/unit_tests/Ifpack2_UnitTestLocalSparseTriangularSolver.cpp @@ -46,9 +46,12 @@ #include "Teuchos_UnitTestHarness.hpp" #include "Ifpack2_LocalSparseTriangularSolver.hpp" +#include "Tpetra_Details_gathervPrint.hpp" +#include "Tpetra_Experimental_BlockView.hpp" #include "Tpetra_CrsMatrix.hpp" #include "Tpetra_MultiVector.hpp" #include "Tpetra_DefaultPlatform.hpp" +#include namespace { // (anonymous) @@ -484,6 +487,893 @@ TEUCHOS_UNIT_TEST_TEMPLATE_3_DECL(LocalSparseTriangularSolver, CompareToLocalSol } } +// Solve Ax = b for x, where A is either upper or lower triangular. +// We don't implement the transpose or conjugate transpose cases here. +template +void +TRSV (VectorType& x, + const CrsMatrixType& A, + VectorType& b, + const char uplo[], + const char trans[], + const char diag[]) +{ + typedef typename CrsMatrixType::scalar_type SC; + //typedef typename CrsMatrixType::impl_scalar_type val_type; + typedef typename CrsMatrixType::local_ordinal_type LO; + typedef typename CrsMatrixType::global_ordinal_type GO; + + bool upper = false; + if (uplo[0] == 'U' || uplo[0] == 'u') { + upper = true; + } + else if (uplo[0] == 'L' || uplo[0] == 'l') { + upper = false; // lower instead + } + else { + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid uplo"); + } + + bool transpose = false; + bool conjugate = false; + if (trans[0] == 'T' || trans[0] == 't') { + transpose = true; + conjugate = false; + } + else if (trans[0] == 'C' || trans[0] == 'c') { + transpose = true; + conjugate = true; + } + else if (trans[0] == 'H' || trans[0] == 'h') { + transpose = true; + conjugate = true; + } + else if (trans[0] == 'N' || trans[0] == 'n') { + transpose = false; + conjugate = false; + } + else { + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid trans"); + } + + TEUCHOS_TEST_FOR_EXCEPTION + (transpose || conjugate, std::logic_error, "Transpose and conjugate " + "transpose cases not implemented"); + + bool implicitUnitDiag = false; + if (diag[0] == 'U' || diag[0] == 'u') { + implicitUnitDiag = true; + } + else if (diag[0] == 'N' || diag[0] == 'n') { + implicitUnitDiag = false; + } + else { + TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid diag"); + } + + const auto colMap = A.getColMap (); + const auto rowMap = A.getRowMap (); + const LO lclNumRows = static_cast (rowMap->getNodeNumElements ()); + + // x is write only, but we sync anyway, just to be sure + x.template sync (); + x.template modify (); + auto x_lcl_2d = x.template getLocalView (); + auto x_lcl_1d = Kokkos::subview (x_lcl_2d, Kokkos::ALL (), 0); + Kokkos::deep_copy (x_lcl_1d, Kokkos::Details::ArithTraits::zero ()); + + // b is read only + b.template sync (); + auto b_lcl_2d = b.template getLocalView (); + auto b_lcl_1d = Kokkos::subview (b_lcl_2d, Kokkos::ALL (), 0); + + // Use global indices, in case the row and column Maps differ. + // This helps us check what the actual diagonal entry is. + Teuchos::Array valsBuf; + Teuchos::Array gblColIndsBuf; + + if (upper) { + // NOTE (mfh 23 Aug 2016) In the upper triangular case, we count + // from N down to 1, to avoid an infinite loop if LO is unsigned. + // (Unsigned numbers are ALWAYS >= 0.) + for (LO lclRowPlusOne = lclNumRows; lclRowPlusOne > static_cast (0); --lclRowPlusOne) { + const LO lclRow = lclRowPlusOne - static_cast (1); + + const size_t numEnt = A.getNumEntriesInLocalRow (lclRow); + if (static_cast (valsBuf.size ()) < numEnt) { + valsBuf.resize (numEnt); + } + if (static_cast (gblColIndsBuf.size ()) < numEnt) { + gblColIndsBuf.resize (numEnt); + } + + Teuchos::ArrayView gblColInds = gblColIndsBuf (0, numEnt); + Teuchos::ArrayView vals = valsBuf (0, numEnt); + + const GO gblRow = rowMap->getGlobalElement (lclRow); + size_t numEnt2 = 0; + A.getGlobalRowCopy (gblRow, gblColInds, vals, numEnt2); + TEUCHOS_TEST_FOR_EXCEPT( numEnt != numEnt2 ); + TEUCHOS_TEST_FOR_EXCEPT( numEnt != numEnt2 ); + TEUCHOS_TEST_FOR_EXCEPT( static_cast (gblColInds.size ()) != numEnt ); + TEUCHOS_TEST_FOR_EXCEPT( static_cast (vals.size ()) != numEnt ); + + // Go through the row. It doesn't matter if the matrix is upper + // or lower triangular here; what matters is distinguishing the + // diagonal entry. + SC x_i = static_cast (b_lcl_1d[lclRow]); + SC diagVal = Teuchos::ScalarTraits::one (); + for (size_t k = 0; k < numEnt; ++k) { + if (gblColInds[k] == gblRow) { + TEUCHOS_TEST_FOR_EXCEPT( implicitUnitDiag && gblColInds[k] == gblRow ); + diagVal = vals[k]; + } + else { + const LO lclCol = colMap->getLocalElement (gblColInds[k]); + TEUCHOS_TEST_FOR_EXCEPT( lclCol == Teuchos::OrdinalTraits::invalid () ); + x_i = x_i - vals[k] * x_lcl_1d[lclCol]; + } + } + + // Update the output entry + { + const LO lclCol = colMap->getLocalElement (gblRow); + TEUCHOS_TEST_FOR_EXCEPT( lclCol == Teuchos::OrdinalTraits::invalid () ); + x_lcl_1d[lclCol] = x_i / diagVal; + } + } // for each local row + } + else { // lower triangular + for (LO lclRow = 0; lclRow < lclNumRows; ++lclRow) { + const size_t numEnt = A.getNumEntriesInLocalRow (lclRow); + if (static_cast (valsBuf.size ()) < numEnt) { + valsBuf.resize (numEnt); + } + if (static_cast (gblColIndsBuf.size ()) < numEnt) { + gblColIndsBuf.resize (numEnt); + } + + Teuchos::ArrayView gblColInds = gblColIndsBuf (0, numEnt); + Teuchos::ArrayView vals = valsBuf (0, numEnt); + + const GO gblRow = rowMap->getGlobalElement (lclRow); + size_t numEnt2 = 0; + A.getGlobalRowCopy (gblRow, gblColInds, vals, numEnt2); + TEUCHOS_TEST_FOR_EXCEPT( numEnt != numEnt2 ); + TEUCHOS_TEST_FOR_EXCEPT( numEnt != numEnt2 ); + TEUCHOS_TEST_FOR_EXCEPT( static_cast (gblColInds.size ()) != numEnt ); + TEUCHOS_TEST_FOR_EXCEPT( static_cast (vals.size ()) != numEnt ); + + // Go through the row. It doesn't matter if the matrix is upper + // or lower triangular here; what matters is distinguishing the + // diagonal entry. + SC x_i = static_cast (b_lcl_1d[lclRow]); + SC diagVal = Teuchos::ScalarTraits::one (); + for (size_t k = 0; k < numEnt; ++k) { + if (gblColInds[k] == gblRow) { + TEUCHOS_TEST_FOR_EXCEPT( implicitUnitDiag && gblColInds[k] == gblRow ); + diagVal = vals[k]; + } + else { + const LO lclCol = colMap->getLocalElement (gblColInds[k]); + TEUCHOS_TEST_FOR_EXCEPT( lclCol == Teuchos::OrdinalTraits::invalid () ); + x_i = x_i - vals[k] * x_lcl_1d[lclCol]; + } + } + + // Update the output entry + { + const LO lclCol = colMap->getLocalElement (gblRow); + TEUCHOS_TEST_FOR_EXCEPT( lclCol == Teuchos::OrdinalTraits::invalid () ); + x_lcl_1d[lclCol] = x_i / diagVal; + } + } // for each local row + } // upper or lower triangular + + x.template sync (); +} + +// Consider a real arrow matrix (arrow pointing down and right) with +// diagonal entries d and other nonzero entries 1. Here is a 4 x 4 +// example: +// +// [d 1] +// [ d 1] +// [ d 1] +// [1 1 1 d] +// +// Compute its LU factorization without pivoting, assuming that all +// the values exist: +// +// [1 ] [d 1 ] +// [ 1 ] [ d 1 ] +// [ 1 ] [ d 1 ] +// [1/d 1/d 1/d 1] [ d - 3/d] +// +// Generalize the pattern: the off-diagonal nonzero entries of the L +// factor are all 1/d, and the lower right entry of U is d - (n-1)/d, +// where the original matrix A is n by n. If d is positive and big +// enough, say d >= 2n, then all diagonal entries of U will be +// sufficiently large for this factorization to make sense. +// Furthermore, if d is a power of 2, then 1/d is exact in binary +// floating-point arithmetic (if it doesn't overflow), as is (1/d + +// 1/d). This lets us easily check our work. +// +// Suppose that we want to solve Ax=b for b = [1 2 ... n]^T. +// For c = Ux, first solve Lc = b: +// +// c = [1, 2, ..., n-1, n - n(n-1)/(2d)]^T +// +// and then solve Ux = c. First, +// +// x_n = c_n / (d - (n-1)/d). +// +// Then, for k = 1, ..., n-1, dx_k + x_n = k, so +// +// x_k = (k - x_n) / d, for k = 1, ..., n-1. +// +// Now, multiply b through by d - (n-1)/d. This completely avoids +// rounding error, as long as no quantities overflow. To get the +// right answer, multiply both c and x through by the same scaling +// factor. + +template +void +testArrowMatrixWithDense (bool& success, Teuchos::FancyOStream& out, const LO lclNumRows) +{ + typedef typename Kokkos::Details::ArithTraits::val_type val_type; + typedef typename Kokkos::Details::ArithTraits::mag_type mag_type; + typedef typename Kokkos::View::HostMirror::execution_space host_execution_space; + typedef Kokkos::HostSpace host_memory_space; + typedef Kokkos::Device HDT; + + Teuchos::OSTab tab0 (out); + out << "Test arrow matrix problem using dense matrices" << endl; + Teuchos::OSTab tab1 (out); + + const LO lclNumCols = lclNumRows; + out << "Test with " << lclNumRows << " x " << lclNumCols << " matrices" << endl; + + // Kokkos Views fill with zeros by default. + Kokkos::View A ("A", lclNumRows, lclNumCols); + Kokkos::View L ("L", lclNumRows, lclNumCols); + Kokkos::View U ("U", lclNumRows, lclNumCols); + + const val_type ZERO = Kokkos::Details::ArithTraits::zero (); + const val_type ONE = Kokkos::Details::ArithTraits::one (); + const val_type TWO = ONE + ONE; + const val_type N = static_cast (static_cast (lclNumRows)); + const val_type d = TWO * N; + + out << "Construct test problem (d = " << d << ")" << endl; + + for (LO i = 0; i < lclNumRows; ++i) { + A(i, i) = d; + if (i + 1 < lclNumRows) { + A(i, lclNumCols-1) = ONE; + } + else if (i + 1 == lclNumRows) { + for (LO j = 0; j + 1 < lclNumCols; ++j) { + A(i, j) = ONE; + } + } + } + + for (LO i = 0; i < lclNumRows; ++i) { + L(i, i) = ONE; + if (i + 1 == lclNumRows) { + for (LO j = 0; j + 1 < lclNumCols; ++j) { + L(i, j) = ONE / d; + } + } + } + + for (LO i = 0; i < lclNumRows; ++i) { + if (i + 1 < lclNumRows) { + U(i, i) = d; + U(i, lclNumCols-1) = ONE; + } + else if (i + 1 == lclNumRows) { + U(i, i) = d - (N - ONE) / d; + } + } + + out << "Use dense matrix-matrix multiply to check that A == L*U" << endl; + + Kokkos::View A_copy ("A_copy", lclNumRows, lclNumCols); + Tpetra::Experimental::GEMM ("N", "N", ONE, L, U, ZERO, A_copy); + for (LO i = 0; i < lclNumRows; ++i) { + out << "Row " << i << endl; + for (LO j = 0; j < lclNumCols; ++j) { + TEST_EQUALITY( A(i,j), A_copy(i,j) ); + } + } + + out << "Check that the LU factorization of A is LU" << endl; + + Kokkos::deep_copy (A_copy, A); + Kokkos::View ipiv ("ipiv", lclNumRows); + int info = 0; + Tpetra::Experimental::GETF2 (A_copy, ipiv, info); + TEST_EQUALITY( info, 0 ); + + for (LO i = 0; i < lclNumRows; ++i) { + out << "Row " << i << endl; + for (LO j = 0; j < lclNumCols; ++j) { + if (j < i) { // lower triangle (L) part of result + TEST_EQUALITY( L(i,j), A_copy(i,j) ); + } + else { // upper triangle (U) part of result + TEST_EQUALITY( U(i,j), A_copy(i,j) ); + } + } + } + for (LO i = 0; i < lclNumRows; ++i) { + // LAPACK pivots are one-based. + TEST_EQUALITY( static_cast (ipiv[i]), i + static_cast (1) ); + } + + // Test our exact solution to Ax=b. + Kokkos::View x ("x", lclNumCols); + Kokkos::View c ("c", lclNumRows); + Kokkos::View b ("b", lclNumRows); + + const val_type scalingFactor = d - (N - ONE) / d; + for (LO i = 0; i < lclNumRows; ++i) { + b(i) = scalingFactor * static_cast (static_cast (i+1)); + //b(i) = static_cast (static_cast (i+1)); + } + // GETRS overwrites the input right-hand side with the solution. + Kokkos::deep_copy (x, b); + + Tpetra::Experimental::GETRS ("N", A_copy, ipiv, x, info); + TEST_EQUALITY( info, 0 ); + + const val_type c_n_unscaled_expected = N - ((N - ONE)*N) / (TWO * d); + const val_type c_n_expected = scalingFactor * c_n_unscaled_expected; + const val_type x_n_unscaled_expected = c_n_unscaled_expected / scalingFactor; + const val_type x_n_expected = c_n_unscaled_expected; + + //TEST_EQUALITY( x(lclNumRows-1), x_n_unscaled_expected ); + TEST_EQUALITY( x(lclNumRows-1), x_n_expected ); + for (LO i = 0; i + 1 < lclNumRows; ++i) { + const val_type K = static_cast (static_cast (i+1)); + //const val_type x_i_unscaled_expected = (K - x_n_unscaled_expected) / d; + //TEST_EQUALITY( x(i), x_i_unscaled_expected ); + + const val_type x_i_expected = (scalingFactor * (K - x_n_unscaled_expected)) / d; + TEST_EQUALITY( x(i), x_i_expected ); + } + + // Now repeat the test to see if we can do a triangular solve with + // L. We do this to test whether solving Lc = b works correctly. + Kokkos::deep_copy (x, b); + Kokkos::deep_copy (c, ZERO); + + // Compute c, the result of solving Lc = b + c(0) = b(0); + for (LO i = 1; i < lclNumRows; ++i) { + val_type c_i = b(i); + for (LO j = 0; j < i; ++j) { + c_i = c_i - L(i,j) * c(j); + } + c(i) = c_i / L(i,i); + } + + // Test the resulting vector c + for (LO i = 0; i + 1 < lclNumRows; ++i) { + const val_type K = static_cast (static_cast (i+1)); + const val_type c_i_unscaled_expected = K; + const val_type c_i_expected = scalingFactor * K; + TEST_EQUALITY( c(i), c_i_expected ); + } + TEST_EQUALITY( c(lclNumRows-1), c_n_expected ); +} + +template::scalar_type, + class LO = Tpetra::Vector<>::local_ordinal_type, + class GO = Tpetra::Vector<>::global_ordinal_type> +void testArrowMatrix (bool& success, Teuchos::FancyOStream& out) +{ + typedef Tpetra::Map map_type; + typedef typename map_type::device_type device_type; + typedef Tpetra::CrsMatrix crs_matrix_type; + typedef Tpetra::RowMatrix row_matrix_type; + typedef Tpetra::Vector vec_type; + typedef Ifpack2::LocalSparseTriangularSolver solver_type; + typedef Kokkos::Details::ArithTraits KAT; + typedef typename KAT::val_type IST; + typedef typename KAT::mag_type mag_type; + int lclSuccess = 1; + int gblSuccess = 1; + + const bool explicitlyStoreUnitDiagonalOfL = false; + + Teuchos::OSTab tab0 (out); + out << "Ifpack2::LocalSparseTriangularSolver: Test with arrow matrix" << endl; + Teuchos::OSTab tab1 (out); + + auto comm = Tpetra::DefaultPlatform::getDefaultPlatform ().getComm (); + + const LO lclNumRows = 8; // power of two (see above) + const LO lclNumCols = lclNumRows; + const GO gblNumRows = comm->getSize () * lclNumRows; + const GO indexBase = 0; + RCP rowMap = + rcp (new map_type (static_cast (gblNumRows), + static_cast (lclNumRows), + indexBase, comm)); + + // At this point, we know Kokkos has been initialized, so test the + // dense version of the problem. + testArrowMatrixWithDense (success, out, lclNumRows); + + // If we construct an upper or lower triangular matrix with an + // implicit unit diagonal, then we need to specify the column Map + // explicitly. Otherwise, the matrix will report having the wrong + // number of columns. In this case, the local matrix is square and + // every column is populated, so we can set column Map = row Map. + RCP colMap = rowMap; + RCP domMap = rowMap; + RCP ranMap = rowMap; + + typedef typename crs_matrix_type::local_graph_type local_graph_type; + typedef typename crs_matrix_type::local_matrix_type local_matrix_type; + typedef typename local_matrix_type::row_map_type::non_const_type row_offsets_type; + typedef typename local_graph_type::entries_type::non_const_type col_inds_type; + typedef typename local_matrix_type::values_type::non_const_type values_type; + + // + // The suffix _d here stands for (GPU) "device," and the suffix _h + // stands for (CPU) "host." + // + + row_offsets_type L_ptr_d ("ptr", lclNumRows + 1); + auto L_ptr_h = Kokkos::create_mirror_view (L_ptr_d); + row_offsets_type U_ptr_d ("ptr", lclNumRows + 1); + auto U_ptr_h = Kokkos::create_mirror_view (U_ptr_d); + + // The local number of _entries_ could in theory require 64 bits + // even if LO is 32 bits. This example doesn't require it, but why + // not be general if there is no serious cost? We use ptrdiff_t + // because it is signed. + const ptrdiff_t L_lclNumEnt = explicitlyStoreUnitDiagonalOfL ? + (2*lclNumRows - 1) : + (lclNumRows - 1); + const ptrdiff_t U_lclNumEnt = 2*lclNumRows - 1; + + col_inds_type L_ind_d ("ind", L_lclNumEnt); + auto L_ind_h = Kokkos::create_mirror_view (L_ind_d); + values_type L_val_d ("val", L_lclNumEnt); + auto L_val_h = Kokkos::create_mirror_view (L_val_d); + + col_inds_type U_ind_d ("ind", U_lclNumEnt); + auto U_ind_h = Kokkos::create_mirror_view (U_ind_d); + values_type U_val_d ("val", U_lclNumEnt); + auto U_val_h = Kokkos::create_mirror_view (U_val_d); + + const IST ONE = KAT::one (); + const IST TWO = KAT::one () + KAT::one (); + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST N = static_cast (static_cast (lclNumRows)); + const IST d = TWO * N; + + ptrdiff_t L_curPos = 0; + for (LO i = 0; i < lclNumRows; ++i) { + L_ptr_h[i] = L_curPos; + + if (i + 1 == lclNumRows) { + // Last row: Add the off-diagonal entries + for (LO j = 0; j + 1 < lclNumCols; ++j) { + L_ind_h[L_curPos] = j; + L_val_h[L_curPos] = ONE / d; + ++L_curPos; + } + } + if (explicitlyStoreUnitDiagonalOfL) { + // Add the diagonal entry + L_ind_h[L_curPos] = i; + L_val_h[L_curPos] = ONE; + ++L_curPos; + } + } + L_ptr_h[lclNumRows] = L_curPos; + + ptrdiff_t U_curPos = 0; + for (LO i = 0; i < lclNumRows; ++i) { + U_ptr_h[i] = U_curPos; + + if (i + 1 < lclNumRows) { + // Add the diagonal entry (first in the row) + U_ind_h[U_curPos] = i; + U_val_h[U_curPos] = d; + ++U_curPos; + + // Add the last entry in the row + U_ind_h[U_curPos] = lclNumCols - 1; + U_val_h[U_curPos] = ONE; + ++U_curPos; + } + else if (i + 1 == lclNumRows) { + // Add the last row's diagonal entry (only entry in this row) + U_ind_h[U_curPos] = lclNumCols - 1; + U_val_h[U_curPos] = d - (N - ONE) / d; + ++U_curPos; + } + } + U_ptr_h[lclNumRows] = U_curPos; + + // Make sure that we counted the number of entries correctly. + TEST_ASSERT( L_curPos == L_lclNumEnt ); + TEST_ASSERT( U_curPos == U_lclNumEnt ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + Kokkos::deep_copy (L_ptr_d, L_ptr_h); + Kokkos::deep_copy (L_ind_d, L_ind_h); + Kokkos::deep_copy (L_val_d, L_val_h); + + Kokkos::deep_copy (U_ptr_d, U_ptr_h); + Kokkos::deep_copy (U_ind_d, U_ind_h); + Kokkos::deep_copy (U_val_d, U_val_h); + + out << "Create the lower triangular Tpetra::CrsMatrix L" << endl; + RCP L; + TEST_NOTHROW( L = rcp (new crs_matrix_type (rowMap, colMap, L_ptr_d, L_ind_d, L_val_d)) ); + TEST_ASSERT( ! L.is_null () ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + out << "Call fillComplete on the lower triangular matrix L" << endl; + TEST_NOTHROW( L->fillComplete (domMap, ranMap) ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Make sure that the last row of L is correct" << endl; + { + Teuchos::OSTab tab2 (out); + + // FIXME (mfh 23 Aug 2016) This may depend on UVM. + // We should instead rely on dual view semantics here. + Teuchos::ArrayView lclColInds; + Teuchos::ArrayView vals; + + L->getLocalRowView (lclNumRows - 1, lclColInds, vals); + if (explicitlyStoreUnitDiagonalOfL) { + TEST_EQUALITY( static_cast (lclColInds.size ()), lclNumCols ); + TEST_EQUALITY( static_cast (vals.size ()), lclNumCols ); + } + else { + TEST_EQUALITY( static_cast (lclColInds.size ()), + lclNumCols - static_cast (1) ); + TEST_EQUALITY( static_cast (vals.size ()), + lclNumCols - static_cast (1) ); + } + if (success) { + // FIXME (mfh 23 Aug 2016) This depends on the entries being + // sorted. They should be, since they were sorted on input, + // using a KokkosSparse::CrsMatrix input. + for (LO j = 0; j + 1 < lclNumCols; ++j) { + TEST_EQUALITY( lclColInds[j], j ); + TEST_EQUALITY( static_cast (vals[j]), ONE / d ); + } + if (explicitlyStoreUnitDiagonalOfL) { + TEST_EQUALITY( lclColInds[lclNumCols-1], lclNumCols - 1 ); + TEST_EQUALITY( static_cast (vals[lclNumCols-1]), ONE ); + } + } + } + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Create the upper triangular Tpetra::CrsMatrix U" << endl; + RCP U; + TEST_NOTHROW( U = rcp (new crs_matrix_type (rowMap, colMap, U_ptr_d, U_ind_d, U_val_d)) ); + TEST_ASSERT( ! U.is_null () ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + out << "Call fillComplete on the upper triangular matrix U" << endl; + TEST_NOTHROW( U->fillComplete (domMap, ranMap) ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // output argument + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Create the solver for L" << endl; + RCP L_solver; + TEST_NOTHROW( L_solver = rcp (new solver_type (L, Teuchos::rcpFromRef (out))) ); + TEST_ASSERT( ! L_solver.is_null () ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // to be revised + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Set up the solver for L" << endl; + TEST_NOTHROW( L_solver->initialize () ); + if (success) { + TEST_NOTHROW( L_solver->compute () ); + } + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // to be revised + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Create the solver for U" << endl; + RCP U_solver; + TEST_NOTHROW( U_solver = rcp (new solver_type (U, Teuchos::rcpFromRef (out))) ); + TEST_ASSERT( ! U_solver.is_null () ); + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // to be revised + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + out << "Set up the solver for U" << endl; + TEST_NOTHROW( U_solver->initialize () ); + if (success) { + TEST_NOTHROW( U_solver->compute () ); + } + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // to be revised + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (! gblSuccess) { + out << "Aborting test" << endl; + return; + } + + const IST scalingFactor = d - (N - ONE) / d; + const bool scaleProblem = true;//false; + + // Set up the right-hand side b. + vec_type b (ranMap); + { + b.template sync (); + b.template modify (); + auto b_lcl_2d = b.template getLocalView (); + auto b_lcl_1d = Kokkos::subview (b_lcl_2d, Kokkos::ALL (), 0); + + for (LO i = 0; i < lclNumRows; ++i) { + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST K = static_cast (static_cast (i+1)); + if (scaleProblem) { + b_lcl_1d(i) = scalingFactor * K; + } + else { + b_lcl_1d(i) = K; + } + } + b.template sync (); + } + + // We solve Ax=b (with A = LU) by first solving Lc = b, and then + // solving Ux = c. Thus, c's Map is the same as U's range Map. + vec_type c (U->getRangeMap ()); + vec_type x (domMap); + + out << "Solve Lc = b for c" << endl; + int lclNotThrew = 1; // to be set below + int gblNotThrew = 0; // output argument + { + std::ostringstream errStrm; + bool threw = false; + try { + L_solver->apply (b, c); + } + catch (std::exception& e) { + errStrm << "L_solver->apply(b,c) threw an exception: " << e.what () << std::endl; + threw = true; + } + catch (...) { + errStrm << "L_solver->apply(b,c) threw an exception not a subclass of " + "std::exception" << std::endl; + threw = true; + } + lclNotThrew = threw ? 0 : 1; + reduceAll (*comm, REDUCE_MIN, lclNotThrew, outArg (gblNotThrew)); + TEST_EQUALITY( gblNotThrew, 1 ); + if (gblNotThrew != 1) { + Tpetra::Details::gathervPrint (out, errStrm.str (), *comm); + } + } + if (gblNotThrew != 1) { + out << "Aborting test" << endl; + return; + } + + // Test the entries of c for correctness. These are EXACT tests, + // which we may do since the solves should not have committed any + // rounding error. See discussion above. + // + // NOTE (mfh 21 Aug 2016) This won't work if we accept approximate + // sparse triangular solves. + + const IST c_n_unscaled_expected = N - ((N - ONE)*N) / (TWO * d); + const IST c_n_expected = scaleProblem ? + (scalingFactor * c_n_unscaled_expected) : + c_n_unscaled_expected; + const IST x_n_unscaled_expected = c_n_unscaled_expected / scalingFactor; + const IST x_n_expected = scaleProblem ? c_n_unscaled_expected : x_n_unscaled_expected; + + out << "Test entries of c (solution of Lc=b)" << endl; + { + Teuchos::OSTab tab2 (out); + + c.template sync (); + auto c_lcl_2d = c.template getLocalView (); + auto c_lcl_1d = Kokkos::subview (c_lcl_2d, Kokkos::ALL (), 0); + + for (LO i = 0; i + 1 < lclNumRows; ++i) { + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST K = static_cast (static_cast (i+1)); + const IST c_i_expected = scaleProblem ? (scalingFactor * K) : K; + TEST_EQUALITY( c_lcl_1d(i), c_i_expected ); + } + TEST_EQUALITY( c_lcl_1d(lclNumRows-1), c_n_expected ); + c.template sync (); + } + // lclSuccess = success ? 1 : 0; + // gblSuccess = 0; // to be revised + // reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + // TEST_EQUALITY( gblSuccess, 1 ); + // if (! gblSuccess) { + // out << "Aborting test" << endl; + // return; + // } + + out << "Solve Ux = c for x" << endl; + TEST_NOTHROW( U_solver->apply (c, x) ); + // lclSuccess = success ? 1 : 0; + // gblSuccess = 0; // to be revised + // reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + // TEST_EQUALITY( gblSuccess, 1 ); + // if (! gblSuccess) { + // out << "Aborting test" << endl; + // return; + // } + + // Test the entries of x for correctness. These are EXACT tests, + // which we may do since the solves should not have committed any + // rounding error. See discussion above. + // + // NOTE (mfh 21 Aug 2016) This won't work if we accept approximate + // sparse triangular solves. + + out << "Test entries of x (solution of Ux=c)" << endl; + { + Teuchos::OSTab tab2 (out); + + x.template sync (); + auto x_lcl_2d = x.template getLocalView (); + auto x_lcl_1d = Kokkos::subview (x_lcl_2d, Kokkos::ALL (), 0); + + for (LO i = 0; i + 1 < lclNumRows; ++i) { + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST K = static_cast (static_cast (i+1)); + const IST x_i_expected = scaleProblem ? + ((scalingFactor * (K - x_n_unscaled_expected)) / d) : + ((K - x_n_unscaled_expected) / d); + TEST_EQUALITY( x_lcl_1d(i), x_i_expected ); + } + TEST_EQUALITY( x_lcl_1d(lclNumRows-1), x_n_expected ); + } + + out << "Test against a reference sparse triangular solver" << endl; + + c.putScalar (Teuchos::ScalarTraits::zero ()); + x.putScalar (Teuchos::ScalarTraits::zero ()); + + const std::string unitDiagL = explicitlyStoreUnitDiagonalOfL ? + "No unit diagonal" : "Unit diagonal"; + TRSV (c, *L, b, "Lower triangular", "No transpose", + unitDiagL.c_str ()); + out << "Test entries of c (solution of Lc=b)" << endl; + { + Teuchos::OSTab tab2 (out); + + c.template sync (); + auto c_lcl_2d = c.template getLocalView (); + auto c_lcl_1d = Kokkos::subview (c_lcl_2d, Kokkos::ALL (), 0); + + for (LO i = 0; i + 1 < lclNumRows; ++i) { + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST K = static_cast (static_cast (i+1)); + const IST c_i_expected = scaleProblem ? (scalingFactor * K) : K; + TEST_EQUALITY( c_lcl_1d(i), c_i_expected ); + } + TEST_EQUALITY( c_lcl_1d(lclNumRows-1), c_n_expected ); + c.template sync (); + } + + TRSV (x, *U, c, "Upper triangular", "No transpose", + "No unit diagonal"); + out << "Test entries of x (solution of Ux=c)" << endl; + { + Teuchos::OSTab tab2 (out); + + x.template sync (); + auto x_lcl_2d = x.template getLocalView (); + auto x_lcl_1d = Kokkos::subview (x_lcl_2d, Kokkos::ALL (), 0); + + for (LO i = 0; i + 1 < lclNumRows; ++i) { + // Don't cast directly from an integer type to IST, + // since if IST is complex, that cast may not exist. + const IST K = static_cast (static_cast (i+1)); + const IST x_i_expected = scaleProblem ? + ((scalingFactor * (K - x_n_unscaled_expected)) / d) : + ((K - x_n_unscaled_expected) / d); + TEST_EQUALITY( x_lcl_1d(i), x_i_expected ); + } + TEST_EQUALITY( x_lcl_1d(lclNumRows-1), x_n_expected ); + } + + // Make sure that all processes succeeded. + lclSuccess = success ? 1 : 0; + gblSuccess = 0; // to be revised + reduceAll (*comm, REDUCE_MIN, lclSuccess, outArg (gblSuccess)); + TEST_EQUALITY( gblSuccess, 1 ); + if (gblSuccess != 1) { + out << "The test FAILED on at least one MPI process." << endl; + } +} + +// This test is really only useful for Scalar = double. +#ifdef HAVE_TPETRA_INST_DOUBLE +TEUCHOS_UNIT_TEST(LocalSparseTriangularSolver, ArrowMatrix) +{ + testArrowMatrix (success, out); +} +#endif // HAVE_TPETRA_INST_DOUBLE + #define UNIT_TEST_GROUP_SC_LO_GO(SC, LO, GO) \ TEUCHOS_UNIT_TEST_TEMPLATE_3_INSTANT(LocalSparseTriangularSolver, CompareToLocalSolve, SC, LO, GO)