use custos::{range, OpenCL};
use custos_math::{cl_to_cpu_s, nn::SoftmaxOps};
use custos_math::{cpu_exec_lhs_rhs_mut, FnsOps, Matrix, RowOp};

#[cfg(feature = "cuda")]
use custos::{Read, CUDA};
#[cfg(feature = "cuda")]
use custos_math::{cu_to_cpu_lr, cu_to_cpu_s, cu_to_cpu_scalar, SumOps};

/// Adds a `1x3` row to each row of a `2x3` OpenCL matrix in place through
/// `cpu_exec_lhs_rhs_mut` and checks the values read back from the device.
///
/// The check always runs once with the device's unified-memory flag cleared.
/// It runs a second time with the flag set, but only if the flag was set
/// when the test started.
#[test]
fn test_swtich_mut_cl() -> custos::Result<()> {
    let device = OpenCL::new(0)?;

    // Remember the initial flag value; it decides whether the second pass runs.
    let was_unified = device.unified_mem();
    device.set_unified_mem(false);

    // [1+1, 2+2, 3+3, 4+1, 5+2, 6+3]
    let expected = vec![2.0, 4.0, 6.0, 5.0, 7.0, 9.0];

    let add_row_and_read = || {
        let mut lhs = Matrix::from((&device, 2, 3, [1., 2., 3., 4., 5., 6.]));
        let row = Matrix::from((&device, 1, 3, [1., 2., 3.]));
        cpu_exec_lhs_rhs_mut(&device, &mut lhs, &row, |cpu, lhs, row| {
            cpu.add_row_mut(lhs, row)
        })?;
        custos::Result::Ok(lhs.read_to_vec())
    };

    assert_eq!(add_row_and_read()?, expected);

    if was_unified {
        device.set_unified_mem(true);
        assert_eq!(add_row_and_read()?, expected);
    }
    Ok(())
}

/// Runs `neg` on a `2x3` OpenCL matrix through `cpu_exec` 10000 times and
/// prints the elapsed time, then runs it once more and verifies the result.
///
/// Returns an error if the OpenCL device cannot be created or if any
/// `cpu_exec` call fails.
#[test]
fn test_unified_mem_device_switch_exact() -> custos::Result<()> {
    use custos_math::{cpu_exec, Matrix};

    let device = OpenCL::new(0)?;

    let a = Matrix::from((&device, 2, 3, [1., 2., 3., 4., 5., 6.]));

    let start = std::time::Instant::now();
    for _ in range(10000) {
        let _m = cpu_exec(&device, &a, |cpu, m| cpu.neg(&m))?;
    }

    println!("duration: {:?}", start.elapsed());

    let m = cpu_exec(&device, &a, |cpu, m| cpu.neg(&m))?;
    println!("return m: {m:?}");

    // Without this check the test would pass even if `neg` returned wrong data.
    // Negating these values is exact, so a plain equality comparison is safe.
    assert_eq!(m.read_to_vec(), vec![-1., -2., -3., -4., -5., -6.]);
    Ok(())
}

/// Times 10000 `softmax` calls routed through `cl_to_cpu_s` on a `2x3`
/// OpenCL matrix, then prints the result of `ln` executed through `cpu_exec`.
///
/// The computed values are only printed, not asserted.
#[test]
fn test_unified_mem_device_switch_softmax() -> custos::Result<()> {
    use custos_math::cpu_exec;
    use custos_math::Matrix;

    let device = OpenCL::new(0)?;
    let input = Matrix::from((&device, 2, 3, [1., 2., 3., 4., 5., 6.]));

    let timer = std::time::Instant::now();
    for _ in range(10000) {
        // The softmax output is dropped right away; only the elapsed time is reported.
        cl_to_cpu_s(&device, &input, |cpu, x| cpu.softmax(&x));
    }
    let elapsed = timer.elapsed();

    println!("duration: {:?}", elapsed);

    let m = cpu_exec(&device, &input, |cpu, x| cpu.ln(&x))?;
    println!("m: {m:?}");
    Ok(())
}

/// Adds a `1x3` row to each row of a `2x3` CUDA matrix in place through
/// `cu_to_cpu_lr_mut` and checks the values read back from the device.
#[cfg(feature = "cuda")]
#[test]
fn test_switch_mut_cu() -> custos::Result<()> {
    use custos_math::cu_to_cpu_lr_mut;

    let device = CUDA::new(0)?;

    let mut lhs = Matrix::from((&device, 2, 3, [1., 2., 3., 4., 5., 6.]));
    let row = Matrix::from((&device, 1, 3, [1., 2., 3.]));
    cu_to_cpu_lr_mut(&device, &mut lhs, &row, |cpu, lhs, row| {
        cpu.add_row_mut(lhs, row)
    });

    // [1+1, 2+2, 3+3, 4+1, 5+2, 6+3]
    let expected = vec![2.0, 4.0, 6.0, 5.0, 7.0, 9.0];
    assert_eq!(lhs.read(), expected);

    Ok(())
}

/// Sums a `3x2` integer CUDA matrix through `cu_to_cpu_scalar` and checks
/// the returned total.
#[cfg(feature = "cuda")]
#[test]
fn test_scalar_switch_cuda() -> custos::Result<()> {
    use custos_math::Matrix;

    let device = CUDA::new(0)?;
    let values = Matrix::from((&device, 3, 2, [1, 2, 3, 4, 5, 6]));

    // 1 + 2 + 3 + 4 + 5 + 6 = 21
    let total = cu_to_cpu_scalar(&values, |cpu, m| cpu.sum(&m));
    assert_eq!(total, 21);

    Ok(())
}

/// Negates a `3x2` CUDA matrix through `cu_to_cpu_s` and checks the values
/// read back from the device.
#[cfg(feature = "cuda")]
#[test]
fn test_single_switch_cuda() -> custos::Result<()> {
    use custos_math::{FnsOps, Matrix};

    let device = CUDA::new(0)?;
    let input = Matrix::from((&device, 3, 2, [1., 2., 3., 4., 5., 6.]));

    let negated = cu_to_cpu_s(&device, &input, |cpu, m| cpu.neg(&m));
    let read_back = device.read(&negated);

    assert_eq!(read_back, vec![-1., -2., -3., -4., -5., -6.]);
    Ok(())
}

/// Adds two `3x2` integer CUDA matrices element-wise through `cu_to_cpu_lr`
/// and checks the values read back from the device.
#[cfg(feature = "cuda")]
#[test]
fn test_lr_switch_cuda() -> custos::Result<()> {
    use custos_math::{BaseOps, Matrix};

    let device = CUDA::new(0)?;

    let lhs = Matrix::from((&device, 3, 2, [1, 2, 3, 4, 5, 6]));
    let rhs = Matrix::from((&device, 3, 2, [2, 2, 3, 4, 5, 7]));
    let sum = cu_to_cpu_lr(&device, &lhs, &rhs, |cpu, a, b| cpu.add(a, b));

    // [1+2, 2+2, 3+3, 4+4, 5+5, 6+7]
    let expected = vec![3, 4, 6, 8, 10, 13];
    assert_eq!(device.read(&sum), expected);
    Ok(())
}

/// Smoke test: creates an OpenCL device and builds a `Buffer` from six
/// integers. Nothing is asserted beyond both steps completing.
#[cfg(feature = "opencl")]
#[test]
fn test_graph_opt_switchting_cl() -> custos::Result<()> {
    use custos::Buffer;

    let device = OpenCL::new(0)?;

    let data = [1, 2, 3, 4, 5, 6];
    let _buffer = Buffer::from((&device, data));

    Ok(())
}