/*

   BLIS
   An object-based framework for developing high-performance BLAS-like
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
   Copyright (C) 2018 - 2019, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
   met:
    - Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    - Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    - Neither the name(s) of the copyright holder(s) nor the names of its
      contributors may be used to endorse or promote products derived
      from this software without specific prior written permission.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*/

#include "blis.h"


static func_t packm_struc_cxk_kers[BLIS_NUM_PACK_SCHEMA_TYPES] =
{
    /* float (0)  scomplex (1)  double (2)  dcomplex (3) */
// 0000 row/col panels
    { { bli_spackm_struc_cxk,      bli_cpackm_struc_cxk,
        bli_dpackm_struc_cxk,      bli_zpackm_struc_cxk,      } },
// 0001 row/col panels: 1m-expanded (1e)
    { { NULL,                      bli_cpackm_struc_cxk,
        NULL,                      bli_zpackm_struc_cxk,  } },
// 0010 row/col panels: 1m-reordered (1r)
    { { NULL,                      bli_cpackm_struc_cxk,
        NULL,                      bli_zpackm_struc_cxk,  } },
};

static void_fp GENARRAY2_ALL(packm_struc_cxk_md,packm_struc_cxk_md);

void bli_packm_blk_var1
     (
       const obj_t*     c,
             obj_t*     p,
       const cntx_t*    cntx,
       const cntl_t*    cntl,
             thrinfo_t* thread_par
     )
{
	// Extract various fields from the control tree.
	pack_t schema  = bli_cntl_packm_params_pack_schema( cntl );
	bool   invdiag = bli_cntl_packm_params_does_invert_diag( cntl );
	bool   revifup = bli_cntl_packm_params_rev_iter_if_upper( cntl );
	bool   reviflo = bli_cntl_packm_params_rev_iter_if_lower( cntl );

	// Every thread initializes p and determines the size of memory block
	// needed (which gets embedded into the otherwise "blank" mem_t entry
	// in the control tree node). Return early if no packing is required.
	if ( !bli_packm_init( c, p, cntx, cntl, bli_thrinfo_sub_node( thread_par ) ) )
		return;

	// Use the sub-prenode. In bli_l3_thrinfo_grow(), this node was created to
	// represent the team of threads as a group of single-member thread teams.
	// This is necessary since the all of the work distribution function depend
	// on the work_id and n_way fields.
	thrinfo_t* thread = bli_thrinfo_sub_prenode( thread_par );

	// Check parameters.
	if ( bli_error_checking_is_enabled() )
		bli_packm_int_check( c, p, cntx );

	num_t   dt_c           = bli_obj_dt( c );
	dim_t   dt_c_size      = bli_dt_size( dt_c );

	num_t   dt_p           = bli_obj_dt( p );
	dim_t   dt_p_size      = bli_dt_size( dt_p );

	struc_t strucc         = bli_obj_struc( c );
	doff_t  diagoffc       = bli_obj_diag_offset( c );
	diag_t  diagc          = bli_obj_diag( c );
	uplo_t  uploc          = bli_obj_uplo( c );
	conj_t  conjc          = bli_obj_conj_status( c );

	dim_t   iter_dim       = bli_obj_length( p );
	dim_t   panel_len_full = bli_obj_width( p );
	dim_t   panel_len_max  = bli_obj_padded_width( p );

	char*   c_cast         = bli_obj_buffer_at_off( c );
	inc_t   incc           = bli_obj_row_stride( c );
	inc_t   ldc            = bli_obj_col_stride( c );
	dim_t   panel_dim_off  = bli_obj_row_off( c );
	dim_t   panel_len_off  = bli_obj_col_off( c );

	char*   p_cast         = bli_obj_buffer( p );
	inc_t   ldp            = bli_obj_col_stride( p );
	inc_t   is_p           = bli_obj_imag_stride( p );
	dim_t   panel_dim_max  = bli_obj_panel_dim( p );
	inc_t   ps_p           = bli_obj_panel_stride( p );

	doff_t  diagoffc_inc   = ( doff_t )panel_dim_max;

	obj_t   kappa_local;
	char*   kappa_cast     = bli_packm_scalar( &kappa_local, p );

	// we use the default lookup table to determine the right func_t
	// for the current schema.
	func_t* packm_kers = &packm_struc_cxk_kers[ bli_pack_schema_index( schema ) ];

	// Query the datatype-specific function pointer from the func_t object.
	packm_ker_ft packm_ker_cast = bli_func_get_dt( dt_p, packm_kers );

	// For mixed-precision gemm, select the proper kernel (only dense panels).
	if ( dt_c != dt_p )
	{
		packm_ker_cast = packm_struc_cxk_md[ dt_c ][ dt_p ];
	}

	// Query the address of the packm params field of the obj_t. The user might
	// have set this field in order to specify a custom packm kernel.
	packm_blk_var1_params_t* params = bli_obj_pack_params( c );

	if ( params && params->ukr_fn[ dt_c ][ dt_p ] )
	{
		// Query the user-provided packing kernel from the obj_t. If provided,
		// this overrides the kernel determined above.
		packm_ker_cast = params->ukr_fn[ dt_c ][ dt_p ];
	}

	// Compute the total number of iterations we'll need.
	dim_t n_iter = iter_dim / panel_dim_max + ( iter_dim % panel_dim_max ? 1 : 0 );

	// Set the initial values and increments for indices related to C and P
	// based on whether reverse iteration was requested.
	dim_t  ic0, ip0;
	doff_t ic_inc, ip_inc;

	if ( ( revifup && bli_is_upper( uploc ) && bli_is_triangular( strucc ) ) ||
	     ( reviflo && bli_is_lower( uploc ) && bli_is_triangular( strucc ) ) )
	{
		ic0    = (n_iter - 1) * panel_dim_max;
		ic_inc = -panel_dim_max;
		ip0    = n_iter - 1;
		ip_inc = -1;
	}
	else
	{
		ic0    = 0;
		ic_inc = panel_dim_max;
		ip0    = 0;
		ip_inc = 1;
	}

	// Query the number of threads (single-member thread teams) and the thread
	// team ids from the current thread's packm thrinfo_t node.
	const dim_t nt  = bli_thrinfo_n_way( thread );
	const dim_t tid = bli_thrinfo_work_id( thread );

	// Determine the thread range and increment using the current thread's
	// packm thrinfo_t node. NOTE: The definition of bli_thread_range_slrr()
	// will depend on whether slab or round-robin partitioning was requested
	// at configure-time.
	dim_t it_start, it_end, it_inc;
	bli_thread_range_slrr( thread, n_iter, 1, FALSE, &it_start, &it_end, &it_inc );

	char* p_begin = p_cast;

	if ( !bli_is_triangular( strucc ) ||
	     bli_is_stored_subpart_n( diagoffc, uploc, iter_dim, panel_len_full ) )
	{
		// This case executes if the panel is either dense, belongs
		// to a Hermitian or symmetric matrix, which includes stored,
		// unstored, and diagonal-intersecting panels, or belongs
		// to a completely stored part of a triangular matrix.

		// Iterate over every logical micropanel in the source matrix.
		for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
		            ic += ic_inc, ip += ip_inc, it += 1 )
		{
			dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
			dim_t  panel_dim_off_i = panel_dim_off + ic;

			char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;

			// Hermitian/symmetric and general packing may use slab or round-
			// robin (bli_is_my_iter()), depending on which was selected at
			// configure-time.
			if ( bli_is_my_iter( it, it_start, it_end, tid, nt ) )
			{
				packm_ker_cast
				(
				  bli_is_triangular( strucc ) ? BLIS_GENERAL : strucc,
				  diagc,
				  uploc,
				  conjc,
				  schema,
				  invdiag,
				  panel_dim_i,
				  panel_len_full,
				  panel_dim_max,
				  panel_len_max,
				  panel_dim_off_i,
				  panel_len_off,
				  kappa_cast,
				  c_begin, incc, ldc,
				  p_begin,       ldp, is_p,
				  params,
				  ( cntx_t* )cntx
				);
			}

			p_begin += ps_p*dt_p_size;
		}
	}
	else
	{
		// This case executes if the panel belongs to a diagonal-intersecting
		// part of a triangular matrix.

		// Iterate over every logical micropanel in the source matrix.
		for ( dim_t ic  = ic0,    ip  = ip0,    it  = 0; it < n_iter;
		            ic += ic_inc, ip += ip_inc, it += 1 )
		{
			dim_t  panel_dim_i     = bli_min( panel_dim_max, iter_dim - ic );
			dim_t  panel_dim_off_i = panel_dim_off + ic;

			doff_t diagoffc_i      = diagoffc + (ip  )*diagoffc_inc;
			char*  c_begin         = c_cast   + (ic  )*incc*dt_c_size;

			if ( bli_is_unstored_subpart_n( diagoffc_i, uploc, panel_dim_i,
			                                panel_len_full ) )
				continue;

			// Sanity check. Diagonals should not intersect the short edge of
			// a micro-panel (typically corresponding to a register blocksize).
			// If they do, then the constraints on cache blocksizes being a
			// whole multiple of the register blocksizes was somehow violated.
			if ( ( diagoffc_i > -panel_dim_i &&
			       diagoffc_i < 0 ) ||
			     ( diagoffc_i > panel_len_full &&
			       diagoffc_i < panel_len_full + panel_dim_i ) )
				bli_check_error_code( BLIS_NOT_YET_IMPLEMENTED );

			dim_t panel_off_i     = 0;
			dim_t panel_len_i     = panel_len_full;
			dim_t panel_len_max_i = panel_len_max;

			if ( bli_intersects_diag_n( diagoffc_i, panel_dim_i, panel_len_full ) )
			{
				if ( bli_is_lower( uploc ) )
				{
					panel_off_i     = 0;
					panel_len_i     = diagoffc_i + panel_dim_i;
					panel_len_max_i = bli_min( diagoffc_i + panel_dim_max,
					                           panel_len_max );
				}
				else // if ( bli_is_upper( uploc ) )
				{
					panel_off_i     = diagoffc_i;
					panel_len_i     = panel_len_full - panel_off_i;
					panel_len_max_i = panel_len_max  - panel_off_i;
				}
			}

			dim_t panel_len_off_i = panel_off_i + panel_len_off;

			char* c_use           = c_begin + (panel_off_i  )*ldc*dt_c_size;
			char* p_use           = p_begin;

			// We need to re-compute the imaginary stride as a function of
			// panel_len_max_i since triangular packed matrices have panels
			// of varying lengths. NOTE: This imaginary stride value is
			// only referenced by the packm kernels for induced methods.
			inc_t is_p_use = ldp * panel_len_max_i;

			// We nudge the imaginary stride up by one if it is odd.
			is_p_use += ( bli_is_odd( is_p_use ) ? 1 : 0 );

			// NOTE: We MUST use round-robin work allocation (bli_is_my_iter_rr())
			// when packing micropanels of a triangular matrix.
			if ( bli_is_my_iter_rr( it, tid, nt ) )
			{
				packm_ker_cast
				(
				  strucc,
				  diagc,
				  uploc,
				  conjc,
				  schema,
				  invdiag,
				  panel_dim_i,
				  panel_len_i,
				  panel_dim_max,
				  panel_len_max_i,
				  panel_dim_off_i,
				  panel_len_off_i,
				  kappa_cast,
				  c_use, incc, ldc,
				  p_use,       ldp,
				         is_p_use,
				  params,
				  ( cntx_t* )cntx
				);
			}

			// NOTE: This value is usually LESS than ps_p because triangular
			// matrices usually have several micro-panels that are shorter
			// than a "full" micro-panel.
			p_begin += is_p_use*dt_p_size;
		}
	}
}