use wgcore::composer::ComposerExt;
use wgcore::gpu::GpuInstance;
use wgcore::hot_reloading::HotReloadState;
use wgcore::kernel::{KernelInvocationBuilder, KernelInvocationQueue};
use wgcore::tensor::GpuVector;
use wgcore::timestamps::GpuTimestamps;
use wgcore::Shader;
use wgpu::{BufferUsages, ComputePipeline};

#[derive(Shader)]
#[shader(src = "timestamp_queries.wgsl", composable = false)]
struct ShaderTimestampQueries {
    main: ComputePipeline,
}

#[async_std::main]
async fn main() -> anyhow::Result<()> {
    // Initialize the gpu device and its queue.
    //
    // Note that `GpuInstance` is just a simple helper struct for initializing the gpu resources.
    // You are free to initialize them independently if more control is needed, or reuse the ones
    // that were already created/owned by e.g., a game engine.
    let gpu = GpuInstance::new().await?;

    // Load and compile our kernel. The `from_device` function was generated by the `Shader` derive.
    // Note that any dependencies on composable shaders are resolved automatically by the `Shader`
    // derive too.
    let mut kernel = ShaderTimestampQueries::from_device(gpu.device())?;

    // Create the buffer operated on by the kernel.
    const LEN: u32 = 2_000_000;
    let buffer = GpuVector::init(
        gpu.device(),
        vec![0u32; LEN as usize],
        BufferUsages::STORAGE | BufferUsages::COPY_SRC,
    );

    // Init hot-reloading.
    // We are setting up hot-reloading so that we can change some elements in the shader
    // (like the iteration count) and see how that affects performance live.
    let mut hot_reload = HotReloadState::new()?;
    ShaderTimestampQueries::watch_sources(&mut hot_reload)?;

    // Init timestamp queries.
    // To measure the time of one kernel, we need two timestamps (one for when it starts and one
    // for when it stops).
    let mut timestamps = GpuTimestamps::new(gpu.device(), 2);

    // Print instructions for the user before entering the hot-reloading loop.
    println!("#############################");
    println!("Edit the file `timestamp_queries.wgsl` (for example by multiplying or dividing NUM_ITERS by 10).\nThe updated runtime will be printed below whenever a change is detected.");
    println!("#############################");

    for loop_id in 0.. {
        // Detect & apply changes.
        hot_reload.update_changes();

        match kernel.reload_if_changed(gpu.device(), &hot_reload) {
            Ok(changed) => {
                if changed || loop_id == 0 {
                    // Clear the timestamps so they can be reused in the next loop.
                    timestamps.clear();

                    // We detected a change (or this is the first loop), so queue and time the
                    // kernel again.
                    let mut queue = KernelInvocationQueue::new(gpu.device());
                    // Declare a compute pass with timestamps enabled.
                    queue.compute_pass("timestamp_queries_test", true);

                    KernelInvocationBuilder::new(&mut queue, &kernel.main)
                        .bind0([buffer.buffer()])
                        .queue(LEN.div_ceil(64));

                    // Encode & submit the operation to the gpu.
                    let mut encoder = gpu.device().create_command_encoder(&Default::default());
                    // Run our kernel.
                    queue.encode(&mut encoder, Some(&mut timestamps));
                    // Resolve the timestamp queries.
                    timestamps.resolve(&mut encoder);
                    gpu.queue().submit(Some(encoder.finish()));

                    // Read and print the kernel's runtime.
                    let timestamps_read = timestamps.wait_for_results_ms(gpu.device(), gpu.queue());
                    println!("Current run time: {}ms", timestamps_read[1] - timestamps_read[0]);
                }
            }
            Err(e) => {
                // Hot-reloading failed, likely due to a syntax error in the shader.
                println!("Hot reloading error: {:?}", e);
            }
        }
    }

    Ok(())
}
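
// The example above expects a `timestamp_queries.wgsl` file next to this source file, but its
// contents are not shown here. The commented-out sketch below is only an assumption of what a
// matching shader could look like, inferred from the Rust side: an entry point named `main`, a
// single storage buffer at group 0 / binding 0, a workgroup size of 64 (matching the
// `LEN.div_ceil(64)` dispatch), and a `NUM_ITERS` constant to edit while the program is running.
// The loop body is an arbitrary artificial workload chosen for illustration.
//
//     const NUM_ITERS: u32 = 1000u;
//
//     @group(0) @binding(0)
//     var<storage, read_write> data: array<u32>;
//
//     @compute @workgroup_size(64)
//     fn main(@builtin(global_invocation_id) id: vec3<u32>) {
//         if (id.x < arrayLength(&data)) {
//             var value = data[id.x];
//             // Artificial workload so the kernel takes a measurable amount of time.
//             for (var i = 0u; i < NUM_ITERS; i += 1u) {
//                 value = value * 7u + 1u;
//             }
//             data[id.x] = value;
//         }
//     }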