use tensorgraph_sys::{
    device::cuda::{Context, SharedStream, Stream},
    Vec, View,
};

use tensorgraph_math::{
    blas::cublas::{CublasContext, SharedCublasContext},
    tensor::Tensor,
};

fn main() {
    // init cuda context
    let cuda_ctx = Context::quick_init().unwrap();

    // create cuda stream
    let stream = Stream::new(&cuda_ctx).unwrap();

    // create cublas context, with the provided stream
    let cublas_ctx = CublasContext::new();
    let cublas_ctx = cublas_ctx.with_stream(Some(&stream));

    run(cublas_ctx, &stream);
}

fn run(ctx: &SharedCublasContext, alloc: &SharedStream) {
    //     0 1
    // A = 2 3
    //     4 5

    // B = 0 1
    //     2 3

    // column major (read each column first)
    let a = Vec::copy_from_host_in(&[0., 2., 4., 1., 3., 5.0_f64], alloc);
    let b = Vec::copy_from_host_in(&[0., 2., 1., 3.], alloc);

    let a = Tensor::from_shape([3, 2], a); // 3 rows x 2 cols
    let b = Tensor::from_shape([2, 2], b); // 2 rows x 2 cols

    //           2  3
    // C = AB =  6 11
    //          10 19

    let c = a.matmul_into(b.view(), ctx, alloc);

    let mut out = [0.; 6];
    c.into_inner().copy_to_host(&mut out);
    assert_eq!(out, [2., 6., 10., 3., 11., 19.]);
}