use nalgebra::DVector;
use wgcore::composer::ComposerExt;
use wgcore::gpu::GpuInstance;
use wgcore::kernel::{KernelInvocationBuilder, KernelInvocationQueue};
use wgcore::tensor::GpuVector;
use wgcore::Shader;
use wgpu::{BufferUsages, ComputePipeline};

// Shader module that contains our composable wgsl functions. The `WgKernel` shader declared
// in this file lists it as a dependency.
// Note that we don't build any compute pipeline from this wgsl file: the struct declares no
// `ComputePipeline` field.
#[derive(Shader)]
#[shader(
    src = "compose_dependency.wgsl" // Shader source code, will be embedded in the exe with `include_str!`.
)]
struct Composable;

// Compute kernel declared from `compose_kernel.wgsl`, with `Composable` listed as its shader
// dependency.
#[derive(Shader)]
#[shader(
    derive(Composable), // This shader depends on the `Composable` shader.
    src = "compose_kernel.wgsl",  // Shader source code, will be embedded in the exe with `include_str!`.
    composable = false    // This shader doesn't export any symbols reusable from other wgsl shaders.
)]
struct WgKernel {
    // This `ComputePipeline` field indicates that the `Shader` macro needs to generate the
    // boilerplate for loading the compute pipeline in `WgKernel::from_device`.
    // NOTE(review): the field name presumably has to match the wgsl entry point — confirm.
    main: ComputePipeline,
}

/// Element type stored in the vectors that `main` uploads with `GpuVector::init`.
///
/// The struct is `#[repr(C)]` and derives `bytemuck::Pod` / `bytemuck::Zeroable`.
/// NOTE(review): presumably this is what lets the data be uploaded as raw bytes, and the
/// layout is expected to match a struct declared on the wgsl side — confirm against the shaders.
#[derive(Copy, Clone, PartialEq, Debug, Default, bytemuck::Pod, bytemuck::Zeroable)]
#[repr(C)]
pub struct MyStruct {
    /// The single `f32` payload carried by each element.
    value: f32,
}

// Entry point of the example: compiles the composed kernel, prints its flattened wgsl sources,
// uploads two vectors of `MyStruct` and submits one kernel dispatch that binds both of them.
//
// Any failure while initializing the gpu, compiling the kernel or flattening the shader sources
// is propagated to the caller through `anyhow::Result`.
#[async_std::main]
async fn main() -> anyhow::Result<()> {
    // Number of elements stored in each of the two storage buffers.
    const LEN: u32 = 1000;
    // Divisor turning the element count into the dispatch size (rounded up).
    // NOTE(review): presumably the workgroup size declared in `compose_kernel.wgsl` — confirm.
    const WORKGROUP_SIZE: u32 = 64;

    // Set up the gpu device and its queue.
    //
    // `GpuInstance` is only a small convenience helper for initializing the gpu resources. They
    // can just as well be initialized by hand when more control is needed, or borrowed from
    // whatever already owns them (e.g., a game engine).
    let gpu = GpuInstance::new().await?;
    let device = gpu.device();

    // Load and compile the kernel. `from_device` is generated by the `Shader` derive, which also
    // takes care of resolving the dependency on `Composable`.
    let kernel = WgKernel::from_device(device)?;

    // Print a banner, then the composed shader sources.
    for banner_line in [
        "######################################",
        "###### Composed shader sources: ######",
        "######################################",
    ] {
        println!("{banner_line}");
    }
    let composed_sources = WgKernel::flat_wgsl()?;
    println!("{composed_sources}");

    // Build the cpu-side data: element `i` holds `i` in the first vector and `i * 10` in the
    // second one.
    let element_count = LEN as usize;
    let lhs_values = DVector::from_fn(element_count, |row, _| MyStruct { value: row as f32 });
    let rhs_values = DVector::from_fn(element_count, |row, _| MyStruct {
        value: row as f32 * 10.0,
    });

    // Upload both vectors into gpu storage buffers.
    let lhs_gpu = GpuVector::init(device, &lhs_values, BufferUsages::STORAGE);
    let rhs_gpu = GpuVector::init(device, &rhs_values, BufferUsages::STORAGE);

    // Queue a single invocation of the kernel, with both buffers bound through `bind0`.
    let workgroup_count = LEN.div_ceil(WORKGROUP_SIZE);
    let mut invocations = KernelInvocationQueue::new(device);
    KernelInvocationBuilder::new(&mut invocations, &kernel.main)
        .bind0([lhs_gpu.buffer(), rhs_gpu.buffer()])
        .queue(workgroup_count);

    // Record the queued invocation into a command encoder and submit it to the gpu queue.
    let mut encoder = device.create_command_encoder(&Default::default());
    invocations.encode(&mut encoder, None);
    let command_buffer = encoder.finish();
    gpu.queue().submit(Some(command_buffer));

    Ok(())
}