try printing out gpu timestamps

this seems to just print all 0s on my laptop.
maybe it'll work better on a newer GPU.
following the example in Embark's rust-gpu:
runners/wgpu/src/compute.rs
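
for reference, the raw values that come out of a timestamp query are device ticks, not nanoseconds; they have to be scaled by queue.get_timestamp_period() (the commented-out line in the diff) before they mean anything. a rough sketch of that conversion, assuming the timings: &[u64] slice that gets read back below (the helper name here is made up):

// sketch only, not part of this commit: turn raw timestamp ticks into durations.
// `timings` is the &[u64] read back from `timestamp_buffer`; `print_step_durations`
// is a hypothetical name.
fn print_step_durations(queue: &wgpu::Queue, timings: &[u64]) {
    // nanoseconds per timestamp tick, as reported by the queue
    let period_ns = queue.get_timestamp_period() as f64;
    // each step records 4 timestamps: E start, E end, H start, H end
    for (step, t) in timings.chunks_exact(4).enumerate() {
        let e_ns = t[1].wrapping_sub(t[0]) as f64 * period_ns;
        let h_ns = t[3].wrapping_sub(t[2]) as f64 * period_ns;
        println!("step {}: E pass {:.1}us, H pass {:.1}us", step, e_ns / 1e3, h_ns / 1e3);
    }
}

the all-zeros output might also be the driver not supporting timestamp writes inside a compute pass on this hardware; issuing write_timestamp on the command encoder (outside the pass) would be another thing worth trying.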
colin 2022-09-26 21:33:38 -07:00
parent 6a7a6bc170
commit 1387506511
1 changed file with 46 additions and 3 deletions

@@ -83,6 +83,15 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
let step_bind_group_layout = &handles.step_bind_group_layout;
let step_e_pipeline = &handles.step_e_pipeline;
let step_h_pipeline = &handles.step_h_pipeline;
let timestamp_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("timestamps"),
// each timestamp is 8 bytes, and we do 4 per step
size: 8 * 4 * num_steps as u64,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: true,
});
timestamp_buffer.unmap();
let sim_meta_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("gpu-side simulation metadata"),
contents: to_bytes(&[meta][..]),
@@ -187,24 +196,34 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
],
});
let queries = device.create_query_set(&wgpu::QuerySetDescriptor {
label: None,
count: 4 * num_steps,
ty: wgpu::QueryType::Timestamp,
});
let workgroups = ((dim.x()+3) / 4, (dim.y()+3) / 4, (dim.z()+3) / 4);
let mut encoder =
device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
- for _ in 0..num_steps {
+ for step in 0..num_steps {
{
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: None });
cpass.set_bind_group(0, &bind_group, &[]);
cpass.set_pipeline(&step_e_pipeline);
cpass.write_timestamp(&queries, 4*step);
cpass.dispatch(workgroups.0, workgroups.1, workgroups.2);
cpass.write_timestamp(&queries, 4*step + 1);
}
{
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: None });
cpass.set_bind_group(0, &bind_group, &[]);
cpass.set_pipeline(&step_h_pipeline);
cpass.write_timestamp(&queries, 4*step + 2);
cpass.dispatch(workgroups.0, workgroups.1, workgroups.2);
cpass.write_timestamp(&queries, 4*step + 3);
}
}
@@ -232,6 +251,8 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
field_bytes as u64,
);
encoder.resolve_query_set(&queries, 0..4*num_steps, &timestamp_buffer, 0);
diag.instrument_write_device(move || {
queue.submit(Some(encoder.finish()));
});
@@ -260,6 +281,20 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
m_readback_buffer.unmap();
});
// let timestamp_period = queue.get_timestamp_period();
let timestamp_readback_slice = timestamp_buffer.slice(..);
let timestamp_readback_future = timestamp_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
{
let mapped = timestamp_readback_slice.get_mapped_range();
let timings: &[u64] = unsafe {
from_bytes(mapped.as_ref())
};
println!("timings: {:?}", timings);
}
timestamp_buffer.unmap();
});
// optimization note: it may be possible to use `WaitForSubmission`
// and copy data to/from even as the GPU begins executing the next job.
device.poll(wgpu::Maintain::Wait);
@@ -267,7 +302,11 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
diag.instrument_read_device(move || {
futures::executor::block_on(futures::future::join(
e_readback_future, futures::future::join(
- h_readback_future, m_readback_future)));
+ h_readback_future, futures::future::join(
+ m_readback_future, timestamp_readback_future
+ )
+ )
+ ));
});
}
}
@@ -327,7 +366,11 @@ async fn open_device(max_buf_size: u64) -> (wgpu::Device, wgpu::Queue) {
.request_device(
&wgpu::DeviceDescriptor {
label: None,
- features: wgpu::Features::SPIRV_SHADER_PASSTHROUGH,
+ features: (
+ wgpu::Features::empty()
+ .union(wgpu::Features::SPIRV_SHADER_PASSTHROUGH)
+ .union(wgpu::Features::TIMESTAMP_QUERY)
+ ),
limits,
},
None,