/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"

//
// bli_packm_sup_init_mem()
//
// Make sure the mem_t entry associated with 'thread' refers to a pba block
// that is large enough to hold the packed matrix, acquiring (or re-acquiring)
// a block if necessary.
//
//   will_pack      If FALSE, the function does nothing.
//   pack_buf_type  Buffer type passed through to bli_pba_acquire_m().
//   dt             Datatype whose element size (bli_dt_size()) scales the
//                  request.
//   m, k           Dimensions used to size the buffer; m is rounded up to a
//                  multiple of mr.
//   mr             Micropanel dimension used for the rounding. (A value of
//                  zero would divide by zero; this is not checked here.)
//   thread         thrinfo_t that supplies the mem_t, the pba, and the
//                  barrier/broadcast communicator.
//
// NOTE(review): because the packing path executes a barrier and a broadcast
// on 'thread', presumably every thread of that group must call this function
// with will_pack == TRUE together -- confirm against the callers.
//
void bli_packm_sup_init_mem
     (
       bool       will_pack,
       packbuf_t  pack_buf_type,
       num_t      dt,
       dim_t      m,
       dim_t      k,
       dim_t      mr,
       thrinfo_t* thread
     )
{
	// Inspect whether we are going to be packing matrix A.
	if ( will_pack == FALSE )
	{
		// If we aren't going to pack, no buffer is needed.
	}
	else // if ( will_pack == TRUE )
	{
		mem_t* mem = bli_thrinfo_mem( thread );
		pba_t* pba = bli_thrinfo_pba( thread );

		// NOTE: This "rounding up" of the last upanel is actually optional
		// for the rrc/crc cases, but absolutely necessary for the other cases
		// since we NEED that last micropanel to have the same ldim (cs_p) as
		// the other micropanels. Why? So that millikernels can use the same
		// upanel ldim for all iterations of the ir loop.
		const dim_t m_pack = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
		const dim_t k_pack = k;

		// Barrier to make sure all threads are caught up and ready to begin
		// the packm stage.
		bli_thrinfo_barrier( thread );

		// Compute the size of the memory block needed.
		siz_t size_needed = bli_dt_size( dt ) * m_pack * k_pack;

		// Check the mem_t entry provided by the caller. If it is unallocated,
		// then we need to acquire a block from the pba.
		if ( bli_mem_is_unalloc( mem ) )
		{
			if ( bli_thrinfo_am_chief( thread ) )
			{
				// Acquire directly to the chief thread's mem_t that was
				// passed in. It needs to be that mem_t struct, and not a
				// local (temporary) mem_t, since there is no barrier until
				// after packing is finished, which could allow a race
				// condition whereby the chief thread exits the current
				// function before the other threads have a chance to copy
				// from it. (A barrier would fix that race condition, but
				// then again, I prefer to keep barriers to a minimum.)
				bli_pba_acquire_m
				(
				  pba,
				  size_needed,
				  pack_buf_type,
				  mem
				);
			}

			// Broadcast the address of the chief thread's passed-in mem_t
			// to all threads.
			mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );

			// Non-chief threads: Copy the contents of the chief thread's
			// passed-in mem_t to the passed-in mem_t for this thread. (The
			// chief thread already has the mem_t, so it does not need to
			// perform any copy.)
			if ( !bli_thrinfo_am_chief( thread ) )
			{
				*mem = *mem_p;
			}
		}
		else // if ( bli_mem_is_alloc( mem ) )
		{
			// If the mem_t entry provided by the caller does NOT contain a
			// NULL buffer, then a block has already been acquired from the
			// pba and cached by the caller.

			// As a sanity check, we should make sure that the mem_t object
			// isn't associated with a block that is too small compared to
			// the size of the packed matrix buffer that is needed, according
			// to the value computed above.
			siz_t mem_size = bli_mem_size( mem );

			if ( mem_size < size_needed )
			{
				if ( bli_thrinfo_am_chief( thread ) )
				{
					// The chief thread releases the existing block associated
					// with the mem_t, and then re-acquires a new block, saving
					// the associated mem_t to its passed-in mem_t. (See the
					// comment in the unallocated case for why the acquisition
					// needs to be directly to the chief thread's passed-in
					// mem_t and not a local (temporary) mem_t.)
					bli_pba_release
					(
					  pba,
					  mem
					);

					bli_pba_acquire_m
					(
					  pba,
					  size_needed,
					  pack_buf_type,
					  mem
					);
				}

				// Broadcast the address of the chief thread's passed-in mem_t
				// to all threads.
				mem_t* mem_p = bli_thrinfo_broadcast( thread, mem );

				// Non-chief threads: Copy the contents of the chief thread's
				// passed-in mem_t to the passed-in mem_t for this thread.
				// (The chief thread already has the mem_t, so it does not
				// need to perform any copy.)
				if ( !bli_thrinfo_am_chief( thread ) )
				{
					*mem = *mem_p;
				}
			}
			else
			{
				// If the mem_t entry is already allocated and sufficiently
				// large, then we use it as-is. No action is needed.
			}
		}
	}
}

//
// bli_packm_sup_finalize_mem()
//
// Release the packing block held in the mem_t entry associated with 'thread'.
//
//   did_pack  If FALSE, the function does nothing.
//   thread    thrinfo_t that supplies the mem_t and the pba. May be NULL, in
//             which case the function does nothing. Only the chief thread
//             performs the release.
//
void bli_packm_sup_finalize_mem
     (
       bool       did_pack,
       thrinfo_t* thread
     )
{
	// Inspect whether we previously packed matrix A.
	if ( did_pack == FALSE )
	{
		// If we didn't pack matrix A, there's nothing to be done.
	}
	else // if ( did_pack == TRUE )
	{
		// Query the thrinfo_t only after confirming it is non-NULL, so that
		// the NULL check actually guards every use of 'thread'.
		if ( thread != NULL )
		if ( bli_thrinfo_am_chief( thread ) )
		{
			mem_t* mem = bli_thrinfo_mem( thread );
			pba_t* pba = bli_thrinfo_pba( thread );

			// Check the mem_t entry provided by the caller. Only proceed if
			// it is allocated, which it should be.
			if ( bli_mem_is_alloc( mem ) )
			{
				bli_pba_release
				(
				  pba,
				  mem
				);
			}
		}
	}
}

//
// bli_packm_sup_init()
//
// Compute the buffer address, strides, panel parameters, and pack schema
// that describe matrix A as it will be read after the (optional) packing
// step.
//
//   will_pack       Selects between using the source matrix directly (FALSE)
//                   and using the packed buffer (TRUE).
//   stor_id         Storage combination id; _RRC and _CRC select plain row
//                   storage, all other values select row panels.
//   schema          [out] BLIS_NOT_PACKED, BLIS_PACKED_ROWS, or
//                   BLIS_PACKED_ROW_PANELS.
//   m, k, mr        Matrix dimensions and micropanel dimension.
//   m_max, k_max    [out] m (rounded up to a multiple of mr when packing)
//                   and k.
//   x, rs_x, cs_x   Source matrix address and strides.
//   p               [out] x when not packing; otherwise the buffer of the
//                   mem_t associated with 'thread'.
//   rs_p, cs_p      [out] Row and column strides to use with *p.
//   pd_p, ps_p      [out] Panel dimension and panel stride to use with *p.
//   thread          thrinfo_t whose mem_t holds the pack buffer; only
//                   consulted when will_pack == TRUE.
//
void bli_packm_sup_init
     (
       bool        will_pack,
       stor3_t     stor_id,
       pack_t*     schema,
       dim_t       m,
       dim_t       k,
       dim_t       mr,
       dim_t*      m_max,
       dim_t*      k_max,
       const void* x, inc_t  rs_x, inc_t  cs_x,
       void**      p, inc_t* rs_p, inc_t* cs_p,
                      dim_t* pd_p, inc_t* ps_p,
       thrinfo_t*  thread
     )
{
	// Inspect whether we are going to be packing matrix A.
	if ( will_pack == FALSE )
	{
		*m_max = m;
		*k_max = k;

		// Set the parameters for use with no packing of A (ie: using the
		// source matrix A directly).
		{
			// Use the strides of the source matrix as the final values.
			*rs_p = rs_x;
			*cs_p = cs_x;

			*pd_p = mr;
			*ps_p = mr * rs_x;

			// Set the schema to "not packed" to indicate that packing will be
			// skipped.
			*schema = BLIS_NOT_PACKED;
		}

		// Since we won't be packing, simply update the buffer address provided
		// by the caller to point to source matrix.
		*p = ( void* )x;
	}
	else // if ( will_pack == TRUE )
	{
		// NOTE: This "rounding up" of the last upanel is actually optional
		// for the rrc/crc cases, but absolutely necessary for the other cases
		// since we NEED that last micropanel to have the same ldim (cs_p) as
		// the other micropanels. Why? So that millikernels can use the same
		// upanel ldim for all iterations of the ir loop.
		*m_max = ( m / mr + ( m % mr ? 1 : 0 ) ) * mr;
		*k_max = k;

		// Determine the dimensions and strides for the packed matrix A.
		if ( stor_id == BLIS_RRC ||
		     stor_id == BLIS_CRC )
		{
			// stor3_t id values _RRC and _CRC: pack A to plain row storage.
			*rs_p = k;
			*cs_p = 1;

			*pd_p = mr;
			*ps_p = mr * k;

			// Set the schema to "row packed" to indicate packing to plain
			// row storage.
			*schema = BLIS_PACKED_ROWS;
		}
		else
		{
			// All other stor3_t ids: pack A to column-stored row-panels.
			*rs_p = 1;
			*cs_p = mr;

			*pd_p = mr;
			*ps_p = mr * k;

			// Set the schema to "packed row panels" to indicate packing to
			// conventional column-stored row panels.
			*schema = BLIS_PACKED_ROW_PANELS;
		}

		// Set the buffer address provided by the caller to point to the
		// memory associated with the mem_t entry acquired from the pba.
		*p = bli_mem_buffer( bli_thrinfo_mem( thread ) );
	}
}

// Function pointer type for the variant used when packing to row panels.
typedef void (*packm_sup_var1_fp)
     (
       trans_t    transc,
       pack_t     schema,
       dim_t      m,
       dim_t      n,
       dim_t      m_max,
       dim_t      n_max,
       void*      kappa,
       void*      c, inc_t rs_c, inc_t cs_c,
       void*      p, inc_t rs_p, inc_t cs_p,
                     dim_t pd_p, inc_t ps_p,
       cntx_t*    cntx,
       thrinfo_t* thread
     );

// Function pointer type for the variant used when packing to plain rows.
typedef void (*packm_sup_var2_fp)
     (
       trans_t    transc,
       pack_t     schema,
       dim_t      m,
       dim_t      n,
       void*      kappa,
       void*      c, inc_t rs_c, inc_t cs_c,
       void*      p, inc_t rs_p, inc_t cs_p,
       cntx_t*    cntx,
       thrinfo_t* thread
     );

// Per-datatype tables of the two variants; indexed by num_t below.
static packm_sup_var1_fp GENARRAY(packm_sup_var1,packm_sup_var1);
static packm_sup_var2_fp GENARRAY(packm_sup_var2,packm_sup_var2);

//
// Define BLAS-like interfaces to the variant chooser.
//

//
// bli_packm_sup()
//
// Prepare the pack buffer, compute the parameters describing matrix A as it
// will subsequently be read, and (if requested) pack A using the variant that
// matches the chosen schema.
//
//   will_pack         If FALSE, nothing is packed and the outputs describe
//                     the source matrix a.
//   pack_buf_type     Buffer type used if a pba block must be acquired.
//   stor_id           Storage combination id used to choose the schema.
//   transc            Passed through to the packing variant.
//   dt                Datatype; sizes the buffer and indexes the variant
//                     tables.
//   m_alloc, k_alloc  Dimensions used to size the pack buffer.
//   m, k              Dimensions of the matrix to pack.
//   mr                Micropanel dimension.
//   kappa             Passed through to the packing variant.
//   a, rs_a, cs_a     Source matrix address and strides.
//   p                 [out] Address of the matrix to use (a or the pack
//                     buffer).
//   rs_p, cs_p, ps_p  [out] Strides and panel stride to use with *p.
//   cntx              Passed through to the packing variant.
//   thread            thrinfo_t for the packing stage; its sub-prenode is
//                     handed to the packing variant.
//
void bli_packm_sup
     (
       bool          will_pack,
       packbuf_t     pack_buf_type,
       stor3_t       stor_id,
       trans_t       transc,
       num_t         dt,
       dim_t         m_alloc,
       dim_t         k_alloc,
       dim_t         m,
       dim_t         k,
       dim_t         mr,
       const void*   kappa,
       const void*   a, inc_t  rs_a, inc_t  cs_a,
       void**        p, inc_t* rs_p, inc_t* cs_p,
                        inc_t* ps_p,
       const cntx_t* cntx,
       thrinfo_t*    thread
     )
{
	pack_t schema;
	dim_t  m_max;
	dim_t  k_max;
	dim_t  pd_p;

	// Prepare the packing destination buffer. If packing is not requested,
	// this function will reduce to a no-op.
	bli_packm_sup_init_mem
	(
	  will_pack,
	  pack_buf_type,
	  dt, m_alloc, k_alloc, mr,
	  thread
	);

	// Determine the packing buffer and related parameters for matrix A. If A
	// will not be packed, then *p will be set to point to a and the output
	// strides will be set to the strides of a.
	bli_packm_sup_init
	(
	  will_pack,
	  stor_id,
	  &schema,
	  m, k, mr,
	  &m_max, &k_max,
	  a, rs_a, cs_a,
	  p, rs_p, cs_p,
	     &pd_p, ps_p,
	  thread
	);

	// Inspect whether we are going to be packing matrix A.
	if ( will_pack == FALSE )
	{
		// If we aren't going to pack matrix A, then there's nothing to do.
	}
	else // if ( will_pack == TRUE )
	{
		if ( schema == BLIS_PACKED_ROWS )
		{
			// For plain packing by rows, use var2.
			packm_sup_var2[ dt ]
			(
			  transc,
			  schema,
			  m,
			  k,
			  ( void* )kappa,
			  ( void* )a, rs_a, cs_a,
			  *p, *rs_p, *cs_p,
			  ( cntx_t* )cntx,
			  bli_thrinfo_sub_prenode( thread )
			);
		}
		else // if ( schema == BLIS_PACKED_ROW_PANELS )
		{
			// For packing to column-stored row panels, use var1.
			packm_sup_var1[ dt ]
			(
			  transc,
			  schema,
			  m,
			  k,
			  m_max,
			  k_max,
			  ( void* )kappa,
			  ( void* )a, rs_a, cs_a,
			  *p, *rs_p, *cs_p,
			      pd_p, *ps_p,
			  ( cntx_t* )cntx,
			  bli_thrinfo_sub_prenode( thread )
			);
		}

		// Barrier so that packing is done before computation.
		bli_thrinfo_barrier( thread );
	}
}