/* //@HEADER // ************************************************************************ // // Kokkos v. 2.0 // Copyright (2014) Sandia Corporation // // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, // the U.S. Government retains certain rights in this software. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are // met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // // 3. Neither the name of the Corporation nor the names of the // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // // Questions? Contact Christian R. Trott (crtrott@sandia.gov) // // ************************************************************************ //@HEADER */ #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP #define KOKKOS_EXAMPLE_FENLFUNCTORS_HPP #include #include #include #include #include #include #include #include #include #include #include #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- namespace Kokkos { namespace Example { namespace FENL { template< class ElemNodeIdView , class CrsGraphType , unsigned ElemNode > class NodeNodeGraph { public: typedef typename ElemNodeIdView::execution_space execution_space ; typedef pair key_type ; typedef Kokkos::UnorderedMap< key_type, void , execution_space > SetType ; typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; // Static dimensions of 0 generate compiler warnings or errors. typedef Kokkos::View< unsigned*[ElemNode][ElemNode] , execution_space > ElemGraphType ; struct TagFillNodeSet {}; struct TagScanNodeCount {}; struct TagFillGraphEntries {}; struct TagSortGraphEntries {}; struct TagFillElementGraph {}; private: enum PhaseType { FILL_NODE_SET , SCAN_NODE_COUNT , FILL_GRAPH_ENTRIES , SORT_GRAPH_ENTRIES , FILL_ELEMENT_GRAPH }; const unsigned node_count ; const ElemNodeIdView elem_node_id ; UnsignedValue row_total ; RowMapType row_count ; RowMapType row_map ; SetType node_node_set ; PhaseType phase ; public: CrsGraphType graph ; ElemGraphType elem_graph ; struct Times { double ratio; double fill_node_set; double scan_node_count; double fill_graph_entries; double sort_graph_entries; double fill_element_graph; }; NodeNodeGraph( const ElemNodeIdView & arg_elem_node_id , const unsigned arg_node_count, Times & results ) : node_count(arg_node_count) , elem_node_id( arg_elem_node_id ) , row_total( "row_total" ) , row_count(Kokkos::ViewAllocateWithoutInitializing("row_count") , node_count ) // will deep_copy to 0 inside loop , row_map( "graph_row_map" , node_count + 1 ) , node_node_set() , phase( FILL_NODE_SET ) , graph() , elem_graph() { //-------------------------------- // Guess at capacity required for the map: Kokkos::Timer wall_clock ; wall_clock.reset(); phase = FILL_NODE_SET ; // upper bound on the capacity size_t set_capacity = (28ull * node_count) / 2; unsigned failed_insert_count = 0 ; do { // Zero the row count to restart the fill Kokkos::deep_copy( row_count , 0u ); node_node_set = SetType( ( set_capacity += failed_insert_count ) ); // May be larger that requested: set_capacity = node_node_set.capacity(); Kokkos::parallel_reduce( Kokkos::RangePolicy(0,elem_node_id.extent(0)) , *this , failed_insert_count ); } while ( failed_insert_count ); execution_space().fence(); results.ratio = (double)node_node_set.size() / (double)node_node_set.capacity(); results.fill_node_set = wall_clock.seconds(); //-------------------------------- wall_clock.reset(); phase = SCAN_NODE_COUNT ; // Exclusive scan of row_count into row_map // including the final total in the 'node_count + 1' position. // Zero the 'row_count' values. Kokkos::parallel_scan( node_count , *this ); // Zero the row count for the fill: Kokkos::deep_copy( row_count , 0u ); unsigned graph_entry_count = 0 ; Kokkos::deep_copy( graph_entry_count , row_total ); // Assign graph's row_map and allocate graph's entries graph.row_map = row_map ; graph.entries = typename CrsGraphType::entries_type( "graph_entries" , graph_entry_count ); //-------------------------------- // Fill graph's entries from the (node,node) set. execution_space().fence(); results.scan_node_count = wall_clock.seconds(); wall_clock.reset(); phase = FILL_GRAPH_ENTRIES ; Kokkos::parallel_for( node_node_set.capacity() , *this ); execution_space().fence(); results.fill_graph_entries = wall_clock.seconds(); //-------------------------------- // Done with the temporary sets and arrays wall_clock.reset(); phase = SORT_GRAPH_ENTRIES ; row_total = UnsignedValue(); row_count = RowMapType(); row_map = RowMapType(); node_node_set.clear(); //-------------------------------- Kokkos::parallel_for( node_count , *this ); execution_space().fence(); results.sort_graph_entries = wall_clock.seconds(); //-------------------------------- // Element-to-graph mapping: wall_clock.reset(); phase = FILL_ELEMENT_GRAPH ; elem_graph = ElemGraphType("elem_graph", elem_node_id.extent(0) ); Kokkos::parallel_for( elem_node_id.extent(0) , *this ); execution_space().fence(); results.fill_element_graph = wall_clock.seconds(); } //------------------------------------ // parallel_for: create map and count row length KOKKOS_INLINE_FUNCTION void operator()( const TagFillNodeSet & , unsigned ielem , unsigned & count ) const { // Loop over element's (row_local_node,col_local_node) pairs: for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { const unsigned row_node = elem_node_id( ielem , row_local_node ); for ( unsigned col_local_node = row_local_node ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { const unsigned col_node = elem_node_id( ielem , col_local_node ); // If either node is locally owned then insert the pair into the unordered map: if ( row_node < row_count.extent(0) || col_node < row_count.extent(0) ) { const key_type key = (row_node < col_node) ? make_pair( row_node, col_node ) : make_pair( col_node, row_node ) ; const typename SetType::insert_result result = node_node_set.insert( key ); // A successfull insert: the first time this pair was added if ( result.success() ) { // If row node is owned then increment count if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); } // If column node is owned and not equal to row node then increment count if ( col_node < row_count.extent(0) && col_node != row_node ) { atomic_fetch_add( & row_count( col_node ) , 1 ); } } else if ( result.failed() ) { ++count ; } } } } } KOKKOS_INLINE_FUNCTION void fill_graph_entries( const unsigned iset ) const { if ( node_node_set.valid_at(iset) ) { // Add each entry to the graph entries. const key_type key = node_node_set.key_at(iset) ; const unsigned row_node = key.first ; const unsigned col_node = key.second ; if ( row_node < row_count.extent(0) ) { const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); graph.entries( offset ) = col_node ; } if ( col_node < row_count.extent(0) && col_node != row_node ) { const unsigned offset = graph.row_map( col_node ) + atomic_fetch_add( & row_count( col_node ) , 1 ); graph.entries( offset ) = row_node ; } } } KOKKOS_INLINE_FUNCTION void sort_graph_entries( const unsigned irow ) const { const unsigned row_beg = graph.row_map( irow ); const unsigned row_end = graph.row_map( irow + 1 ); for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { const unsigned col = graph.entries(i); unsigned j = i ; for ( ; row_beg < j && col < graph.entries(j-1) ; --j ) { graph.entries(j) = graph.entries(j-1); } graph.entries(j) = col ; } } KOKKOS_INLINE_FUNCTION void fill_elem_graph_map( const unsigned ielem ) const { for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { const unsigned row_node = elem_node_id( ielem , row_local_node ); for ( unsigned col_local_node = 0 ; col_local_node < elem_node_id.extent(1) ; ++col_local_node ) { const unsigned col_node = elem_node_id( ielem , col_local_node ); unsigned entry = ~0u ; if ( row_node + 1 < graph.row_map.extent(0) ) { const unsigned entry_end = graph.row_map( row_node + 1 ); entry = graph.row_map( row_node ); for ( ; entry < entry_end && graph.entries(entry) != col_node ; ++entry ); if ( entry == entry_end ) entry = ~0u ; } elem_graph( ielem , row_local_node , col_local_node ) = entry ; } } } KOKKOS_INLINE_FUNCTION void operator()( const unsigned iwork ) const { /* if ( phase == FILL_NODE_SET ) { operator()( TagFillNodeSet() , iwork ); } else */ if ( phase == FILL_GRAPH_ENTRIES ) { fill_graph_entries( iwork ); } else if ( phase == SORT_GRAPH_ENTRIES ) { sort_graph_entries( iwork ); } else if ( phase == FILL_ELEMENT_GRAPH ) { fill_elem_graph_map( iwork ); } } //------------------------------------ // parallel_scan: row offsets typedef unsigned value_type ; KOKKOS_INLINE_FUNCTION void operator()( const unsigned irow , unsigned & update , const bool final ) const { // exclusive scan if ( final ) { row_map( irow ) = update ; } update += row_count( irow ); if ( final ) { if ( irow + 1 == row_count.extent(0) ) { row_map( irow + 1 ) = update ; row_total() = update ; } } } // For the reduce phase: KOKKOS_INLINE_FUNCTION void init( const TagFillNodeSet & , unsigned & update ) const { update = 0 ; } KOKKOS_INLINE_FUNCTION void join( const TagFillNodeSet & , volatile unsigned & update , volatile const unsigned & input ) const { update += input ; } // For the scan phase:: KOKKOS_INLINE_FUNCTION void init( unsigned & update ) const { update = 0 ; } KOKKOS_INLINE_FUNCTION void join( volatile unsigned & update , volatile const unsigned & input ) const { update += input ; } //------------------------------------ }; } /* namespace FENL */ } /* namespace Example */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- namespace Kokkos { namespace Example { namespace FENL { template< class ElemCompType > class NodeElemGatherFill { public: typedef typename ElemCompType::execution_space execution_space ; typedef typename ElemCompType::vector_type vector_type ; typedef typename ElemCompType::sparse_matrix_type sparse_matrix_type ; typedef typename ElemCompType::elem_node_type elem_node_type ; typedef typename ElemCompType::elem_vectors_type elem_vectors_type ; typedef typename ElemCompType::elem_matrices_type elem_matrices_type ; typedef typename ElemCompType::elem_graph_type elem_graph_type ; static const unsigned ElemNodeCount = ElemCompType::ElemNodeCount ; //------------------------------------ private: typedef Kokkos::StaticCrsGraph< unsigned[2] , execution_space > CrsGraphType ; typedef typename CrsGraphType::row_map_type::non_const_type RowMapType ; typedef Kokkos::View< unsigned , execution_space > UnsignedValue ; enum PhaseType { FILL_NODE_COUNT , SCAN_NODE_COUNT , FILL_GRAPH_ENTRIES , SORT_GRAPH_ENTRIES , GATHER_FILL }; const elem_node_type elem_node_id ; const elem_graph_type elem_graph ; UnsignedValue row_total ; RowMapType row_count ; RowMapType row_map ; CrsGraphType graph ; vector_type residual ; sparse_matrix_type jacobian ; elem_vectors_type elem_residual ; elem_matrices_type elem_jacobian ; PhaseType phase ; public: NodeElemGatherFill() : elem_node_id() , elem_graph() , row_total() , row_count() , row_map() , graph() , residual() , jacobian() , elem_residual() , elem_jacobian() , phase( FILL_NODE_COUNT ) {} NodeElemGatherFill( const NodeElemGatherFill & rhs ) : elem_node_id( rhs.elem_node_id ) , elem_graph( rhs.elem_graph ) , row_total( rhs.row_total ) , row_count( rhs.row_count ) , row_map( rhs.row_map ) , graph( rhs.graph ) , residual( rhs.residual ) , jacobian( rhs.jacobian ) , elem_residual( rhs.elem_residual ) , elem_jacobian( rhs.elem_jacobian ) , phase( rhs.phase ) {} NodeElemGatherFill( const elem_node_type & arg_elem_node_id , const elem_graph_type & arg_elem_graph , const vector_type & arg_residual , const sparse_matrix_type & arg_jacobian , const elem_vectors_type & arg_elem_residual , const elem_matrices_type & arg_elem_jacobian ) : elem_node_id( arg_elem_node_id ) , elem_graph( arg_elem_graph ) , row_total( "row_total" ) , row_count( "row_count" , arg_residual.extent(0) ) , row_map( "graph_row_map" , arg_residual.extent(0) + 1 ) , graph() , residual( arg_residual ) , jacobian( arg_jacobian ) , elem_residual( arg_elem_residual ) , elem_jacobian( arg_elem_jacobian ) , phase( FILL_NODE_COUNT ) { //-------------------------------- // Count node->element relations phase = FILL_NODE_COUNT ; Kokkos::parallel_for( elem_node_id.extent(0) , *this ); //-------------------------------- phase = SCAN_NODE_COUNT ; // Exclusive scan of row_count into row_map // including the final total in the 'node_count + 1' position. // Zero the 'row_count' values. Kokkos::parallel_scan( residual.extent(0) , *this ); // Zero the row count for the fill: Kokkos::deep_copy( row_count , typename RowMapType::value_type(0) ); unsigned graph_entry_count = 0 ; Kokkos::deep_copy( graph_entry_count , row_total ); // Assign graph's row_map and allocate graph's entries graph.row_map = row_map ; typedef typename CrsGraphType::entries_type graph_entries_type ; graph.entries = graph_entries_type( "graph_entries" , graph_entry_count ); //-------------------------------- // Fill graph's entries from the (node,node) set. phase = FILL_GRAPH_ENTRIES ; Kokkos::deep_copy( row_count , 0u ); Kokkos::parallel_for( elem_node_id.extent(0) , *this ); execution_space().fence(); //-------------------------------- // Done with the temporary sets and arrays row_total = UnsignedValue(); row_count = RowMapType(); row_map = RowMapType(); //-------------------------------- phase = SORT_GRAPH_ENTRIES ; Kokkos::parallel_for( residual.extent(0) , *this ); execution_space().fence(); phase = GATHER_FILL ; } void apply() const { Kokkos::parallel_for( residual.extent(0) , *this ); } //------------------------------------ //------------------------------------ // parallel_for: Count node->element pairs KOKKOS_INLINE_FUNCTION void fill_node_count( const unsigned ielem ) const { for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { const unsigned row_node = elem_node_id( ielem , row_local_node ); if ( row_node < row_count.extent(0) ) { atomic_fetch_add( & row_count( row_node ) , 1 ); } } } KOKKOS_INLINE_FUNCTION void fill_graph_entries( const unsigned ielem ) const { for ( unsigned row_local_node = 0 ; row_local_node < elem_node_id.extent(1) ; ++row_local_node ) { const unsigned row_node = elem_node_id( ielem , row_local_node ); if ( row_node < row_count.extent(0) ) { const unsigned offset = graph.row_map( row_node ) + atomic_fetch_add( & row_count( row_node ) , 1 ); graph.entries( offset , 0 ) = ielem ; graph.entries( offset , 1 ) = row_local_node ; } } } KOKKOS_INLINE_FUNCTION void sort_graph_entries( const unsigned irow ) const { const unsigned row_beg = graph.row_map( irow ); const unsigned row_end = graph.row_map( irow + 1 ); for ( unsigned i = row_beg + 1 ; i < row_end ; ++i ) { const unsigned elem = graph.entries(i,0); const unsigned local = graph.entries(i,1); unsigned j = i ; for ( ; row_beg < j && elem < graph.entries(j-1,0) ; --j ) { graph.entries(j,0) = graph.entries(j-1,0); graph.entries(j,1) = graph.entries(j-1,1); } graph.entries(j,0) = elem ; graph.entries(j,1) = local ; } } //------------------------------------ KOKKOS_INLINE_FUNCTION void gather_fill( const unsigned irow ) const { const unsigned node_elem_begin = graph.row_map(irow); const unsigned node_elem_end = graph.row_map(irow+1); // for each element that a node belongs to for ( unsigned i = node_elem_begin ; i < node_elem_end ; i++ ) { const unsigned elem_id = graph.entries( i, 0); const unsigned row_index = graph.entries( i, 1); residual(irow) += elem_residual(elem_id, row_index); // for each node in a particular related element // gather the contents of the element stiffness // matrix that belong in irow for ( unsigned j = 0 ; j < ElemNodeCount ; ++j ) { const unsigned A_index = elem_graph( elem_id , row_index , j ); jacobian.coeff( A_index ) += elem_jacobian( elem_id, row_index, j ); } } } //------------------------------------ KOKKOS_INLINE_FUNCTION void operator()( const unsigned iwork ) const { if ( phase == FILL_NODE_COUNT ) { fill_node_count( iwork ); } else if ( phase == FILL_GRAPH_ENTRIES ) { fill_graph_entries( iwork ); } else if ( phase == SORT_GRAPH_ENTRIES ) { sort_graph_entries( iwork ); } else if ( phase == GATHER_FILL ) { gather_fill( iwork ); } } //------------------------------------ // parallel_scan: row offsets typedef unsigned value_type ; KOKKOS_INLINE_FUNCTION void operator()( const unsigned irow , unsigned & update , const bool final ) const { // exclusive scan if ( final ) { row_map( irow ) = update ; } update += row_count( irow ); if ( final ) { if ( irow + 1 == row_count.extent(0) ) { row_map( irow + 1 ) = update ; row_total() = update ; } } } KOKKOS_INLINE_FUNCTION void init( unsigned & update ) const { update = 0 ; } KOKKOS_INLINE_FUNCTION void join( volatile unsigned & update , const volatile unsigned & input ) const { update += input ; } }; } /* namespace FENL */ } /* namespace Example */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- namespace Kokkos { namespace Example { namespace FENL { template< class FiniteElementMeshType , class SparseMatrixType > class ElementComputation ; template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType > class ElementComputation< Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > , Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > > { public: typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap > mesh_type ; typedef Kokkos::Example::HexElement_Data< mesh_type::ElemNode > element_data_type ; typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > sparse_matrix_type ; typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; typedef ExecSpace execution_space ; typedef ScalarType scalar_type ; static const unsigned SpatialDim = element_data_type::spatial_dimension ; static const unsigned TensorDim = SpatialDim * SpatialDim ; static const unsigned ElemNodeCount = element_data_type::element_node_count ; static const unsigned FunctionCount = element_data_type::function_count ; static const unsigned IntegrationCount = element_data_type::integration_count ; //------------------------------------ typedef typename mesh_type::node_coord_type node_coord_type ; typedef typename mesh_type::elem_node_type elem_node_type ; typedef Kokkos::View< scalar_type*[FunctionCount][FunctionCount] , execution_space > elem_matrices_type ; typedef Kokkos::View< scalar_type*[FunctionCount] , execution_space > elem_vectors_type ; typedef Kokkos::View< scalar_type* , execution_space > vector_type ; typedef typename NodeNodeGraph< elem_node_type , sparse_graph_type , ElemNodeCount >::ElemGraphType elem_graph_type ; //------------------------------------ //------------------------------------ // Computational data: const element_data_type elem_data ; const elem_node_type elem_node_ids ; const node_coord_type node_coords ; const elem_graph_type elem_graph ; const elem_matrices_type elem_jacobians ; const elem_vectors_type elem_residuals ; const vector_type solution ; const vector_type residual ; const sparse_matrix_type jacobian ; const scalar_type coeff_K ; ElementComputation( const ElementComputation & rhs ) : elem_data() , elem_node_ids( rhs.elem_node_ids ) , node_coords( rhs.node_coords ) , elem_graph( rhs.elem_graph ) , elem_jacobians( rhs.elem_jacobians ) , elem_residuals( rhs.elem_residuals ) , solution( rhs.solution ) , residual( rhs.residual ) , jacobian( rhs.jacobian ) , coeff_K( rhs.coeff_K ) {} // If the element->sparse_matrix graph is provided then perform atomic updates // Otherwise fill per-element contributions for subequent gather-add into a residual and jacobian. ElementComputation( const mesh_type & arg_mesh , const scalar_type arg_coeff_K , const vector_type & arg_solution , const elem_graph_type & arg_elem_graph , const sparse_matrix_type & arg_jacobian , const vector_type & arg_residual ) : elem_data() , elem_node_ids( arg_mesh.elem_node() ) , node_coords( arg_mesh.node_coord() ) , elem_graph( arg_elem_graph ) , elem_jacobians() , elem_residuals() , solution( arg_solution ) , residual( arg_residual ) , jacobian( arg_jacobian ) , coeff_K( arg_coeff_K ) {} ElementComputation( const mesh_type & arg_mesh , const scalar_type arg_coeff_K , const vector_type & arg_solution ) : elem_data() , elem_node_ids( arg_mesh.elem_node() ) , node_coords( arg_mesh.node_coord() ) , elem_graph() , elem_jacobians( "elem_jacobians" , arg_mesh.elem_count() ) , elem_residuals( "elem_residuals" , arg_mesh.elem_count() ) , solution( arg_solution ) , residual() , jacobian() , coeff_K( arg_coeff_K ) {} //------------------------------------ void apply() const { parallel_for( elem_node_ids.extent(0) , *this ); } //------------------------------------ static const unsigned FLOPS_transform_gradients = /* Jacobian */ FunctionCount * TensorDim * 2 + /* Inverse jacobian */ TensorDim * 6 + 6 + /* Gradient transform */ FunctionCount * 15 ; KOKKOS_INLINE_FUNCTION float transform_gradients( const float grad[][ FunctionCount ] , // Gradient of bases master element const double x[] , const double y[] , const double z[] , float dpsidx[] , float dpsidy[] , float dpsidz[] ) const { enum { j11 = 0 , j12 = 1 , j13 = 2 , j21 = 3 , j22 = 4 , j23 = 5 , j31 = 6 , j32 = 7 , j33 = 8 }; // Jacobian accumulation: double J[ TensorDim ] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 }; for( unsigned i = 0; i < FunctionCount ; ++i ) { const double x1 = x[i] ; const double x2 = y[i] ; const double x3 = z[i] ; const float g1 = grad[0][i] ; const float g2 = grad[1][i] ; const float g3 = grad[2][i] ; J[j11] += g1 * x1 ; J[j12] += g1 * x2 ; J[j13] += g1 * x3 ; J[j21] += g2 * x1 ; J[j22] += g2 * x2 ; J[j23] += g2 * x3 ; J[j31] += g3 * x1 ; J[j32] += g3 * x2 ; J[j33] += g3 * x3 ; } // Inverse jacobian: float invJ[ TensorDim ] = { static_cast( J[j22] * J[j33] - J[j23] * J[j32] ) , static_cast( J[j13] * J[j32] - J[j12] * J[j33] ) , static_cast( J[j12] * J[j23] - J[j13] * J[j22] ) , static_cast( J[j23] * J[j31] - J[j21] * J[j33] ) , static_cast( J[j11] * J[j33] - J[j13] * J[j31] ) , static_cast( J[j13] * J[j21] - J[j11] * J[j23] ) , static_cast( J[j21] * J[j32] - J[j22] * J[j31] ) , static_cast( J[j12] * J[j31] - J[j11] * J[j32] ) , static_cast( J[j11] * J[j22] - J[j12] * J[j21] ) }; const float detJ = J[j11] * invJ[j11] + J[j21] * invJ[j12] + J[j31] * invJ[j13] ; const float detJinv = 1.0 / detJ ; for ( unsigned i = 0 ; i < TensorDim ; ++i ) { invJ[i] *= detJinv ; } // Transform gradients: for( unsigned i = 0; i < FunctionCount ; ++i ) { const float g0 = grad[0][i]; const float g1 = grad[1][i]; const float g2 = grad[2][i]; dpsidx[i] = g0 * invJ[j11] + g1 * invJ[j12] + g2 * invJ[j13]; dpsidy[i] = g0 * invJ[j21] + g1 * invJ[j22] + g2 * invJ[j23]; dpsidz[i] = g0 * invJ[j31] + g1 * invJ[j32] + g2 * invJ[j33]; } return detJ ; } KOKKOS_INLINE_FUNCTION void contributeResidualJacobian( const float coeff_k , const double dof_values[] , const float dpsidx[] , const float dpsidy[] , const float dpsidz[] , const float detJ , const float integ_weight , const float bases_vals[] , double elem_res[] , double elem_mat[][ FunctionCount ] ) const { double value_at_pt = 0 ; double gradx_at_pt = 0 ; double grady_at_pt = 0 ; double gradz_at_pt = 0 ; for ( unsigned m = 0 ; m < FunctionCount ; m++ ) { value_at_pt += dof_values[m] * bases_vals[m] ; gradx_at_pt += dof_values[m] * dpsidx[m] ; grady_at_pt += dof_values[m] * dpsidy[m] ; gradz_at_pt += dof_values[m] * dpsidz[m] ; } const scalar_type k_detJ_weight = coeff_k * detJ * integ_weight ; const double res_val = value_at_pt * value_at_pt * detJ * integ_weight ; const double mat_val = 2.0 * value_at_pt * detJ * integ_weight ; // $$ R_i = \int_{\Omega} \nabla \phi_i \cdot (k \nabla T) + \phi_i T^2 d \Omega $$ // $$ J_{i,j} = \frac{\partial R_i}{\partial T_j} = \int_{\Omega} k \nabla \phi_i \cdot \nabla \phi_j + 2 \phi_i \phi_j T d \Omega $$ for ( unsigned m = 0; m < FunctionCount; ++m) { double * const mat = elem_mat[m] ; const float bases_val_m = bases_vals[m]; const float dpsidx_m = dpsidx[m] ; const float dpsidy_m = dpsidy[m] ; const float dpsidz_m = dpsidz[m] ; elem_res[m] += k_detJ_weight * ( dpsidx_m * gradx_at_pt + dpsidy_m * grady_at_pt + dpsidz_m * gradz_at_pt ) + res_val * bases_val_m ; for( unsigned n = 0; n < FunctionCount; n++) { mat[n] += k_detJ_weight * ( dpsidx_m * dpsidx[n] + dpsidy_m * dpsidy[n] + dpsidz_m * dpsidz[n] ) + mat_val * bases_val_m * bases_vals[n]; } } } KOKKOS_INLINE_FUNCTION void operator()( const unsigned ielem ) const { // Gather nodal coordinates and solution vector: double x[ FunctionCount ] ; double y[ FunctionCount ] ; double z[ FunctionCount ] ; double val[ FunctionCount ] ; unsigned node_index[ ElemNodeCount ]; for ( unsigned i = 0 ; i < ElemNodeCount ; ++i ) { const unsigned ni = elem_node_ids( ielem , i ); node_index[i] = ni ; x[i] = node_coords( ni , 0 ); y[i] = node_coords( ni , 1 ); z[i] = node_coords( ni , 2 ); val[i] = solution( ni ); } double elem_vec[ FunctionCount ] ; double elem_mat[ FunctionCount ][ FunctionCount ] ; for( unsigned i = 0; i < FunctionCount ; i++ ) { elem_vec[i] = 0 ; for( unsigned j = 0; j < FunctionCount ; j++){ elem_mat[i][j] = 0 ; } } for ( unsigned i = 0 ; i < IntegrationCount ; ++i ) { float dpsidx[ FunctionCount ] ; float dpsidy[ FunctionCount ] ; float dpsidz[ FunctionCount ] ; const float detJ = transform_gradients( elem_data.gradients[i] , x , y , z , dpsidx , dpsidy , dpsidz ); contributeResidualJacobian( coeff_K , val , dpsidx , dpsidy , dpsidz , detJ , elem_data.weights[i] , elem_data.values[i] , elem_vec , elem_mat ); } #if 0 if ( 1 == ielem ) { printf("ElemResidual { %f %f %f %f %f %f %f %f }\n", elem_vec[0], elem_vec[1], elem_vec[2], elem_vec[3], elem_vec[4], elem_vec[5], elem_vec[6], elem_vec[7]); printf("ElemJacobian {\n"); for ( unsigned j = 0 ; j < FunctionCount ; ++j ) { printf(" { %f %f %f %f %f %f %f %f }\n", elem_mat[j][0], elem_mat[j][1], elem_mat[j][2], elem_mat[j][3], elem_mat[j][4], elem_mat[j][5], elem_mat[j][6], elem_mat[j][7]); } printf("}\n"); } #endif if ( ! residual.extent(0) ) { for( unsigned i = 0; i < FunctionCount ; i++){ elem_residuals(ielem, i) = elem_vec[i] ; for( unsigned j = 0; j < FunctionCount ; j++){ elem_jacobians(ielem, i, j) = elem_mat[i][j] ; } } } else { for( unsigned i = 0 ; i < FunctionCount ; i++ ) { const unsigned row = node_index[i] ; if ( row < residual.extent(0) ) { atomic_fetch_add( & residual( row ) , elem_vec[i] ); for( unsigned j = 0 ; j < FunctionCount ; j++ ) { const unsigned entry = elem_graph( ielem , i , j ); if ( entry != ~0u ) { atomic_fetch_add( & jacobian.coeff( entry ) , elem_mat[i][j] ); } } } } } } }; /* ElementComputation */ //---------------------------------------------------------------------------- template< class FixtureType , class SparseMatrixType > class DirichletComputation ; template< class ExecSpace , BoxElemPart::ElemOrder Order , class CoordinateMap , typename ScalarType > class DirichletComputation< Kokkos::Example::BoxElemFixture< ExecSpace , Order , CoordinateMap > , Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > > { public: typedef Kokkos::Example::BoxElemFixture< ExecSpace, Order, CoordinateMap > mesh_type ; typedef typename mesh_type::node_coord_type node_coord_type ; typedef typename node_coord_type::value_type scalar_coord_type ; typedef Kokkos::Example::CrsMatrix< ScalarType , ExecSpace > sparse_matrix_type ; typedef typename sparse_matrix_type::StaticCrsGraphType sparse_graph_type ; typedef ExecSpace execution_space ; typedef ScalarType scalar_type ; //------------------------------------ typedef Kokkos::View< scalar_type* , execution_space > vector_type ; //------------------------------------ // Computational data: const node_coord_type node_coords ; const vector_type solution ; const sparse_matrix_type jacobian ; const vector_type residual ; const scalar_type bc_lower_value ; const scalar_type bc_upper_value ; const scalar_coord_type bc_lower_limit ; const scalar_coord_type bc_upper_limit ; const unsigned bc_plane ; const unsigned node_count ; bool init ; DirichletComputation( const mesh_type & arg_mesh , const vector_type & arg_solution , const sparse_matrix_type & arg_jacobian , const vector_type & arg_residual , const unsigned arg_bc_plane , const scalar_type arg_bc_lower_value , const scalar_type arg_bc_upper_value ) : node_coords( arg_mesh.node_coord() ) , solution( arg_solution ) , jacobian( arg_jacobian ) , residual( arg_residual ) , bc_lower_value( arg_bc_lower_value ) , bc_upper_value( arg_bc_upper_value ) , bc_lower_limit( std::numeric_limits::epsilon() ) , bc_upper_limit( scalar_coord_type(1) - std::numeric_limits::epsilon() ) , bc_plane( arg_bc_plane ) , node_count( arg_mesh.node_count_owned() ) , init( false ) { parallel_for( node_count , *this ); init = true ; } void apply() const { parallel_for( node_count , *this ); } //------------------------------------ KOKKOS_INLINE_FUNCTION void operator()( const unsigned inode ) const { // Apply dirichlet boundary condition on the Solution and Residual vectors. // To maintain the symmetry of the original global stiffness matrix, // zero out the columns that correspond to boundary conditions, and // update the residual vector accordingly const unsigned iBeg = jacobian.graph.row_map[inode]; const unsigned iEnd = jacobian.graph.row_map[inode+1]; const scalar_coord_type c = node_coords(inode,bc_plane); const bool bc_lower = c <= bc_lower_limit ; const bool bc_upper = bc_upper_limit <= c ; if ( ! init ) { solution(inode) = bc_lower ? bc_lower_value : ( bc_upper ? bc_upper_value : 0 ); } else { if ( bc_lower || bc_upper ) { residual(inode) = 0 ; // zero each value on the row, and leave a one // on the diagonal for( unsigned i = iBeg ; i < iEnd ; ++i ) { jacobian.coeff(i) = int(inode) == int(jacobian.graph.entries(i)) ? 1 : 0 ; } } else { // Find any columns that are boundary conditions. // Clear them and adjust the residual vector for( unsigned i = iBeg ; i < iEnd ; ++i ) { const unsigned cnode = jacobian.graph.entries(i) ; const scalar_coord_type cc = node_coords(cnode,bc_plane); if ( ( cc <= bc_lower_limit ) || ( bc_upper_limit <= cc ) ) { jacobian.coeff(i) = 0 ; } } } } } }; } /* namespace FENL */ } /* namespace Example */ } /* namespace Kokkos */ //---------------------------------------------------------------------------- /* A Cuda-specific specialization for the element computation functor. */ #if defined( __CUDACC__ ) // #include #endif //---------------------------------------------------------------------------- #endif /* #ifndef KOKKOS_EXAMPLE_FENLFUNCTORS_HPP */