/* //@HEADER // ************************************************************************ // // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation // // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. Neither the name of the Corporation nor the names of the // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) // // ************************************************************************ //@HEADER */

// NOTE(review): this file was recovered from a copy in which newlines had been
// collapsed and every "<...>" span had been stripped. Template argument lists,
// the tails of the diagnostic prints, the error-summary lines and the end of
// the file were rebuilt from the surrounding code. Confirm those spots (each
// is marked NOTE(review) below) against the upstream Kokkos perf test.

namespace Test {

/// 3D seven-point-sum kernel timed with a rank-3 Kokkos::MDRangePolicy.
///
/// A has extents (irange, jrange, krange); B is allocated with two extra
/// entries per dimension so that the +1/+2 neighbour reads stay in bounds.
template< class DeviceType
        , typename ScalarType = double
        , typename TestLayout = Kokkos::LayoutRight
        >
struct MultiDimRangePerf3D
{
  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type;

  using iterate_type = Kokkos::Iterate;

  // NOTE(review): View template arguments reconstructed (rank 3 is fixed by the
  // A(i,j,k) accesses below).
  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
  typedef typename view_type::HostMirror host_view_type;

  view_type A;
  view_type B;
  const long irange;
  const long jrange;
  const long krange;

  MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
  {}

  /// Kernel body: A(i,j,k) = 0.25 * (sum of B at (i,j,k) and its +1/+2
  /// neighbours along each axis).
  KOKKOS_INLINE_FUNCTION
  void operator()(const long i, const long j, const long k) const
  {
    A(i,j,k) = 0.25*static_cast<ScalarType>( B(i+2,j,k) + B(i+1,j,k)
                                           + B(i,j+2,k) + B(i,j+1,k)
                                           + B(i,j,k+2) + B(i,j,k+1)
                                           + B(i,j,k) );
  }

  struct InitZeroTag {};

  /// Fills a view with 1 (untagged call) or 0 (InitZeroTag call).
  struct Init
  {
    Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
    : input(input_), irange(irange_), jrange(jrange_), krange(krange_)
    {}

    KOKKOS_INLINE_FUNCTION
    void operator()(const long i, const long j, const long k) const
    {
      input(i,j,k) = 1.0;
    }

    KOKKOS_INLINE_FUNCTION
    void operator()(const InitZeroTag&, const long i, const long j, const long k) const
    {
      input(i,j,k) = 0;
    }

    view_type input;
    const long irange;
    const long jrange;
    const long krange;
  };

  /// Copies Atest/Btest to host mirrors, recomputes the stencil serially and
  /// returns the number of entries of A that differ from it. Each mismatch is
  /// printed with `prefix` followed by its index.
  static long count_errors(const view_type & Atest, const view_type & Btest,
                           const long icount, const long jcount, const long kcount,
                           const char * prefix)
  {
    host_view_type Ahost("Ahost", icount, jcount, kcount);
    Kokkos::deep_copy(Ahost, Atest);
    host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
    Kokkos::deep_copy(Bhost, Btest);

    long numErrors = 0;
    // On KNL, this may vectorize - the print statement is there to prevent it.
    // The comparison is exact (difference != 0), not an epsilon test.
    for ( long l = 0; l < icount; ++l ) {
    for ( long j = 0; j < jcount; ++j ) {
    for ( long k = 0; k < kcount; ++k ) {
      ScalarType check = 0.25*static_cast<ScalarType>( Bhost(l+2,j,k) + Bhost(l+1,j,k)
                                                     + Bhost(l,j+2,k) + Bhost(l,j+1,k)
                                                     + Bhost(l,j,k+2) + Bhost(l,j,k+1)
                                                     + Bhost(l,j,k) );
      if ( Ahost(l,j,k) - check != 0 ) {
        ++numErrors;
        // NOTE(review): everything after the first "," was lost in the damaged
        // source; the remainder of this message is a reconstruction.
        std::cout << prefix << l << "," << j << "," << k
                  << " Ahost = " << Ahost(l,j,k)
                  << " expected = " << check << std::endl;
      }
    } } }
    return numErrors;
  }

  /// Allocates and initialises the views, then times `iter` launches of the
  /// kernel under the given MDRangePolicy type and returns the minimum time in
  /// seconds. Correctness is checked after the first launch only.
  template< class MDRangeType >
  static double run_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount,
                                const unsigned int Ti, const unsigned int Tj, const unsigned int Tk,
                                const long iter)
  {
    typedef typename MDRangeType::tile_type tile_type;
    typedef typename MDRangeType::point_type point_type;
    typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;

    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount+2, jcount+2, kcount+2);

    MDRangeType policy_initA(point_type{{0,0,0}}, point_type{{icount,jcount,kcount}}, tile_type{{Ti,Tj,Tk}});
    MDRangeType policy_initB(point_type{{0,0,0}}, point_type{{icount+2,jcount+2,kcount+2}}, tile_type{{Ti,Tj,Tk}});
    MDRangeType policy(point_type{{0,0,0}}, point_type{{icount,jcount,kcount}}, tile_type{{Ti,Tj,Tk}});

    Kokkos::parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
    execution_space().fence();
    Kokkos::parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
    execution_space().fence();

    double dt_min = 0;
    // Loop counter is long to match `iter` (was int).
    for (long i = 0; i < iter; ++i)
    {
      Kokkos::Timer timer;
      Kokkos::parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
      execution_space().fence();
      const double dt = timer.seconds();
      if ( 0 == i ) dt_min = dt ;
      else dt_min = dt < dt_min ? dt : dt_min ;

      //Correctness check - only the first run
      if ( 0 == i )
      {
        const long numErrors = count_errors(Atest, Btest, icount, jcount, kcount,
                                            " Correctness error at index: ");
        // NOTE(review): summary line reconstructed; original text was stripped.
        if ( numErrors != 0 ) { std::cout << " multi: errors " << numErrors << std::endl; }
      }
    } //end for

    return dt_min;
  }

  /// Times the kernel over the full 3D range with tile sizes (Ti,Tj,Tk).
  /// Returns the minimum wall time in seconds over `iter` launches.
  static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
  {
    //This test performs multidim range over all dims
    // NOTE(review): Rank<3, ...> arguments and the plain `else` for the
    // LayoutLeft branch are reconstructed.

    // LayoutRight
    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
      typedef Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
      return run_multi_index<MDRangeType>(icount, jcount, kcount, Ti, Tj, Tk, iter);
    }
    // LayoutLeft
    else {
      typedef Kokkos::MDRangePolicy<Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
      return run_multi_index<MDRangeType>(icount, jcount, kcount, Ti, Tj, Tk, iter);
    }
  }
};


/// Same kernel driven by a 1D Kokkos::RangePolicy that collapses two of the
/// three dimensions into the flat index; the remaining (contiguous) dimension
/// is looped over inside the functor.
// NOTE(review): template header reconstructed by analogy with MultiDimRangePerf3D.
template< class DeviceType
        , typename ScalarType = double
        , typename TestLayout = Kokkos::LayoutRight
        >
struct RangePolicyCollapseTwo
{
  // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for multi-dim; unroll 2 dims in one-dim

  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type;
  typedef TestLayout layout;

  using iterate_type = Kokkos::Iterate;

  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
  typedef typename view_type::HostMirror host_view_type;

  view_type A;
  view_type B;
  const long irange;
  const long jrange;
  const long krange;

  RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
  : A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
  {}

  /// Decodes flat index r into the two collapsed indices and sweeps the third.
  /// Index arithmetic stays in long (the former int() casts truncated it).
  KOKKOS_INLINE_FUNCTION
  void operator()(const long r) const
  {
    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
    {
      //id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
      //r = j + i*Nj
      const long i = r / jrange;
      const long j = r - i*jrange;
      for (long k = 0; k < krange; ++k) {
        A(i,j,k) = 0.25*static_cast<ScalarType>( B(i+2,j,k) + B(i+1,j,k)
                                               + B(i,j+2,k) + B(i,j+1,k)
                                               + B(i,j,k+2) + B(i,j,k+1)
                                               + B(i,j,k) );
      }
    }
    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
    {
      //id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
      //r = j + k*Nj
      const long k = r / jrange;
      const long j = r - k*jrange;
      for (long i = 0; i < irange; ++i) {
        A(i,j,k) = 0.25*static_cast<ScalarType>( B(i+2,j,k) + B(i+1,j,k)
                                               + B(i,j+2,k) + B(i,j+1,k)
                                               + B(i,j,k+2) + B(i,j,k+1)
                                               + B(i,j,k) );
      }
    }
  }

  /// Sets every entry of a view to 1 using the same two-dim collapse.
  struct Init
  {
    view_type input;
    const long irange;
    const long jrange;
    const long krange;

    Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
    : input(input_), irange(irange_), jrange(jrange_), krange(krange_)
    {}

    KOKKOS_INLINE_FUNCTION
    void operator()(const long r) const
    {
      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
      {
        const long i = r / jrange;
        const long j = r - i*jrange;
        for (long k = 0; k < krange; ++k) {
          input(i,j,k) = 1;
        }
      }
      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
      {
        const long k = r / jrange;
        const long j = r - k*jrange;
        for (long i = 0; i < irange; ++i) {
          input(i,j,k) = 1;
        }
      }
    }
  };

  /// Copies Atest/Btest to host mirrors, recomputes the stencil serially and
  /// returns the number of entries of A that differ from it. Each mismatch is
  /// printed with `prefix` followed by its index.
  static long count_errors(const view_type & Atest, const view_type & Btest,
                           const long icount, const long jcount, const long kcount,
                           const char * prefix)
  {
    host_view_type Ahost("Ahost", icount, jcount, kcount);
    Kokkos::deep_copy(Ahost, Atest);
    host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
    Kokkos::deep_copy(Bhost, Btest);

    long numErrors = 0;
    // On KNL, this may vectorize - the print statement is there to prevent it.
    // The comparison is exact (difference != 0), not an epsilon test.
    for ( long l = 0; l < icount; ++l ) {
    for ( long j = 0; j < jcount; ++j ) {
    for ( long k = 0; k < kcount; ++k ) {
      ScalarType check = 0.25*static_cast<ScalarType>( Bhost(l+2,j,k) + Bhost(l+1,j,k)
                                                     + Bhost(l,j+2,k) + Bhost(l,j+1,k)
                                                     + Bhost(l,j,k+2) + Bhost(l,j,k+1)
                                                     + Bhost(l,j,k) );
      if ( Ahost(l,j,k) - check != 0 ) {
        ++numErrors;
        // NOTE(review): everything after the first "," was lost in the damaged
        // source; the remainder of this message is a reconstruction.
        std::cout << prefix << l << "," << j << "," << k
                  << " Ahost = " << Ahost(l,j,k)
                  << " expected = " << check << std::endl;
      }
    } } }
    return numErrors;
  }

  /// Times the two-dim-collapsed kernel; returns the minimum wall time in
  /// seconds over `iter` launches. Exits the process if TestLayout is neither
  /// LayoutRight nor LayoutLeft.
  static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
  {
    // This test refers to collapsing two dims while using the RangePolicy
    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
    typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;

    // Widen before multiplying: the products used to be evaluated in
    // unsigned int and could wrap before being stored in a long.
    const long ni = icount;
    const long nj = jcount;
    const long nk = kcount;

    long collapse_index_rangeA = 0;
    long collapse_index_rangeB = 0;
    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
      collapse_index_rangeA = ni*nj;
      collapse_index_rangeB = (ni+2)*(nj+2);
    }
    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
      collapse_index_rangeA = nk*nj;
      collapse_index_rangeB = (nk+2)*(nj+2);
    }
    else {
      std::cout << " LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
      exit(-1);
    }

    Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
    Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );

    double dt_min = 0;

    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
    execution_space().fence();
    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
    execution_space().fence();

    // Loop counter is long to match `iter` (was int).
    for (long i = 0; i < iter; ++i)
    {
      Kokkos::Timer timer;
      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
      execution_space().fence();
      const double dt = timer.seconds();
      if ( 0 == i ) dt_min = dt ;
      else dt_min = dt < dt_min ? dt : dt_min ;

      //Correctness check - first iteration only
      if ( 0 == i )
      {
        const long numErrors = count_errors(Atest, Btest, ni, nj, nk,
                                            " Correctness error at index: ");
        // NOTE(review): summary line reconstructed; original text was stripped.
        if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << std::endl; }
      }
    }

    return dt_min;
  }
};


/// Same kernel driven by a 1D Kokkos::RangePolicy over the fully flattened
/// index space; all three indices are decoded from the flat index.
// NOTE(review): template header reconstructed by analogy with MultiDimRangePerf3D.
template< class DeviceType
        , typename ScalarType = double
        , typename TestLayout = Kokkos::LayoutRight
        >
struct RangePolicyCollapseAll
{
  // RangePolicy for 3D range, but will collapse all dims

  typedef DeviceType execution_space;
  typedef typename execution_space::size_type size_type;
  typedef TestLayout layout;

  typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
  typedef typename view_type::HostMirror host_view_type;

  view_type A;
  view_type B;
  const long irange;
  const long jrange;
  const long krange;

  RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
  : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
  {}

  /// Decodes flat index r into (i,j,k) for the active layout and applies the
  /// stencil. Index arithmetic stays in long (the former int() casts
  /// truncated it).
  KOKKOS_INLINE_FUNCTION
  void operator()(const long r) const
  {
    if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
    {
      const long i = r / (jrange*krange);
      const long j = ( r - i*jrange*krange)/krange;
      const long k = r - i*jrange*krange - j*krange;
      A(i,j,k) = 0.25*static_cast<ScalarType>( B(i+2,j,k) + B(i+1,j,k)
                                             + B(i,j+2,k) + B(i,j+1,k)
                                             + B(i,j,k+2) + B(i,j,k+1)
                                             + B(i,j,k) );
    }
    else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
    {
      const long k = r / (irange*jrange);
      const long j = ( r - k*irange*jrange)/irange;
      const long i = r - k*irange*jrange - j*irange;
      A(i,j,k) = 0.25*static_cast<ScalarType>( B(i+2,j,k) + B(i+1,j,k)
                                             + B(i,j+2,k) + B(i,j+1,k)
                                             + B(i,j,k+2) + B(i,j,k+1)
                                             + B(i,j,k) );
    }
  }

  /// Sets every entry of a view to 1 using the same full collapse.
  struct Init
  {
    view_type input;
    const long irange;
    const long jrange;
    const long krange;

    Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
    : input(input_), irange(irange_), jrange(jrange_), krange(krange_)
    {}

    KOKKOS_INLINE_FUNCTION
    void operator()(const long r) const
    {
      if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
      {
        const long i = r / (jrange*krange);
        const long j = ( r - i*jrange*krange)/krange;
        const long k = r - i*jrange*krange - j*krange;
        input(i,j,k) = 1;
      }
      else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
      {
        const long k = r / (irange*jrange);
        const long j = ( r - k*irange*jrange)/irange;
        const long i = r - k*irange*jrange - j*irange;
        input(i,j,k) = 1;
      }
    }
  };

  /// Copies Atest/Btest to host mirrors, recomputes the stencil serially and
  /// returns the number of entries of A that differ from it. Each mismatch is
  /// printed with `prefix` followed by its index.
  static long count_errors(const view_type & Atest, const view_type & Btest,
                           const long icount, const long jcount, const long kcount,
                           const char * prefix)
  {
    host_view_type Ahost("Ahost", icount, jcount, kcount);
    Kokkos::deep_copy(Ahost, Atest);
    host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
    Kokkos::deep_copy(Bhost, Btest);

    long numErrors = 0;
    // On KNL, this may vectorize - the print statement is there to prevent it.
    // The comparison is exact (difference != 0), not an epsilon test.
    for ( long l = 0; l < icount; ++l ) {
    for ( long j = 0; j < jcount; ++j ) {
    for ( long k = 0; k < kcount; ++k ) {
      ScalarType check = 0.25*static_cast<ScalarType>( Bhost(l+2,j,k) + Bhost(l+1,j,k)
                                                     + Bhost(l,j+2,k) + Bhost(l,j+1,k)
                                                     + Bhost(l,j,k+2) + Bhost(l,j,k+1)
                                                     + Bhost(l,j,k) );
      if ( Ahost(l,j,k) - check != 0 ) {
        ++numErrors;
        // NOTE(review): everything after the first "," was lost in the damaged
        // source; the remainder of this message is a reconstruction.
        std::cout << prefix << l << "," << j << "," << k
                  << " Ahost = " << Ahost(l,j,k)
                  << " expected = " << check << std::endl;
      }
    } } }
    return numErrors;
  }

  /// Times the fully collapsed kernel; returns the minimum wall time in
  /// seconds over `iter` launches.
  static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
  {
    //This test refers to collapsing all dims using the RangePolicy
    view_type Atest("Atest", icount, jcount, kcount);
    view_type Btest("Btest", icount+2, jcount+2, kcount+2);
    typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;

    // Widen before multiplying: the products used to be evaluated in
    // unsigned int and could wrap before being stored in a long.
    const long ni = icount;
    const long nj = jcount;
    const long nk = kcount;

    const long flat_index_range = ni*nj*nk;
    Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
    Kokkos::RangePolicy<execution_space> policy_initB(0, (ni+2)*(nj+2)*(nk+2) );

    double dt_min = 0;

    Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
    execution_space().fence();
    Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
    execution_space().fence();

    // Loop counter is long to match `iter` (was int).
    for (long i = 0; i < iter; ++i)
    {
      Kokkos::Timer timer;
      Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
      execution_space().fence();
      const double dt = timer.seconds();
      if ( 0 == i ) dt_min = dt ;
      else dt_min = dt < dt_min ? dt : dt_min ;

      //Correctness check - first iteration only
      if ( 0 == i )
      {
        const long numErrors = count_errors(Atest, Btest, ni, nj, nk,
                                            " Callapse ALL Correctness error at index: ");
        // NOTE(review): summary line reconstructed; original text was stripped.
        if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << std::endl; }
      }
    }

    // NOTE(review): the source was cut off inside the diagnostic print above;
    // the return and the closing braces below are reconstructed.
    return dt_min;
  }
};

} //end namespace Test