try printing out gpu timestamps

this seems to just print all 0s on my laptop.
maybe it'll work better on a newer GPU.
following the example in Embark's rust-gpu:
runners/wgpu/src/compute.rs
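
for reference, the raw values that come out of a timestamp query are device ticks, not nanoseconds; they have to be scaled by queue.get_timestamp_period() (the commented-out line in the diff) before they mean anything. a rough sketch of that conversion, assuming the timings: &[u64] slice that gets read back below (the helper name here is made up):

// sketch only, not part of this commit: turn raw timestamp ticks into durations.
// `timings` is the &[u64] read back from `timestamp_buffer`; `print_step_durations`
// is a hypothetical name.
fn print_step_durations(queue: &wgpu::Queue, timings: &[u64]) {
    // nanoseconds per timestamp tick, as reported by the queue
    let period_ns = queue.get_timestamp_period() as f64;
    // each step records 4 timestamps: E start, E end, H start, H end
    for (step, t) in timings.chunks_exact(4).enumerate() {
        let e_ns = t[1].wrapping_sub(t[0]) as f64 * period_ns;
        let h_ns = t[3].wrapping_sub(t[2]) as f64 * period_ns;
        println!("step {}: E pass {:.1}us, H pass {:.1}us", step, e_ns / 1e3, h_ns / 1e3);
    }
}

the all-zeros output might also be the driver not supporting timestamp writes inside a compute pass on this hardware; issuing write_timestamp on the command encoder (outside the pass) would be another thing worth trying.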
colin 2022-09-26 21:33:38 -07:00
parent 6a7a6bc170
commit 1387506511
1 changed file with 46 additions and 3 deletions

@@ -83,6 +83,15 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
let step_bind_group_layout = &handles.step_bind_group_layout;
let step_e_pipeline = &handles.step_e_pipeline;
let step_h_pipeline = &handles.step_h_pipeline;
let timestamp_buffer = device.create_buffer(&wgpu::BufferDescriptor {
label: Some("timestamps"),
// each timestamp is 8 bytes, and we do 4 per step
size: 8 * 4 * num_steps as u64,
usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
mapped_at_creation: true,
});
timestamp_buffer.unmap();
let sim_meta_buffer = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
label: Some("gpu-side simulation metadata"),
contents: to_bytes(&[meta][..]),
@@ -187,24 +196,34 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
],
});
let queries = device.create_query_set(&wgpu::QuerySetDescriptor {
label: None,
count: 4 * num_steps,
ty: wgpu::QueryType::Timestamp,
});
let workgroups = ((dim.x()+3) / 4, (dim.y()+3) / 4, (dim.z()+3) / 4);
let mut encoder =
device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None });
- for _ in 0..num_steps {
+ for step in 0..num_steps {
{
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: None });
cpass.set_bind_group(0, &bind_group, &[]);
cpass.set_pipeline(&step_e_pipeline);
cpass.write_timestamp(&queries, 4*step);
cpass.dispatch(workgroups.0, workgroups.1, workgroups.2);
cpass.write_timestamp(&queries, 4*step + 1);
}
{
let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor { label: None });
cpass.set_bind_group(0, &bind_group, &[]);
cpass.set_pipeline(&step_h_pipeline);
cpass.write_timestamp(&queries, 4*step + 2);
cpass.dispatch(workgroups.0, workgroups.1, workgroups.2);
cpass.write_timestamp(&queries, 4*step + 3);
}
}
@@ -232,6 +251,8 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
field_bytes as u64,
);
encoder.resolve_query_set(&queries, 0..4*num_steps, &timestamp_buffer, 0);
diag.instrument_write_device(move || {
queue.submit(Some(encoder.finish()));
});
@@ -260,6 +281,20 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
m_readback_buffer.unmap();
});
// let timestamp_period = queue.get_timestamp_period();
let timestamp_readback_slice = timestamp_buffer.slice(..);
let timestamp_readback_future = timestamp_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
{
let mapped = timestamp_readback_slice.get_mapped_range();
let timings: &[u64] = unsafe {
from_bytes(mapped.as_ref())
};
println!("timings: {:?}", timings);
}
timestamp_buffer.unmap();
});
// optimization note: it may be possible to use `WaitForSubmission`
// and copy data to/from even as the GPU begins executing the next job.
device.poll(wgpu::Maintain::Wait);
@@ -267,7 +302,11 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
diag.instrument_read_device(move || {
futures::executor::block_on(futures::future::join(
e_readback_future, futures::future::join(
- h_readback_future, m_readback_future)));
+ h_readback_future, futures::future::join(
+ m_readback_future, timestamp_readback_future
+ )
+ )
+ ));
});
}
}
@@ -327,7 +366,11 @@ async fn open_device(max_buf_size: u64) -> (wgpu::Device, wgpu::Queue) {
.request_device(
&wgpu::DeviceDescriptor {
label: None,
- features: wgpu::Features::SPIRV_SHADER_PASSTHROUGH,
+ features: (
+ wgpu::Features::empty()
+ .union(wgpu::Features::SPIRV_SHADER_PASSTHROUGH)
+ .union(wgpu::Features::TIMESTAMP_QUERY)
+ ),
limits,
},
None,