capture high-level GPU timing diagnostics

This shows that we spend about 2/3 of the GPU roundtrip time on
stepping; the other 1/3 is presumably scheduling/latency/memory transfers.
This commit is contained in:
2022-09-27 00:20:41 -07:00
parent 1387506511
commit a3d9b28ab6
2 changed files with 25 additions and 6 deletions

View File

@@ -22,6 +22,8 @@ pub struct Diagnostics {
time_reading_device: Duration,
/// time during which CPU was transferring data to GPU
time_writing_device: Duration,
/// time during which the GPU was actively computing steps
time_stepping_device: Duration,
start_time: Instant,
}
@@ -48,6 +50,7 @@ impl Diagnostics {
time_blocked_on_render: Default::default(),
time_reading_device: Default::default(),
time_writing_device: Default::default(),
time_stepping_device: Default::default(),
start_time: Instant::now(),
}
}
@@ -78,9 +81,11 @@ impl Diagnostics {
other_time,
);
let device_step_time = self.time_stepping_device.as_secs_f64();
let device_write_time = self.time_writing_device.as_secs_f64();
let device_read_time = self.time_reading_device.as_secs_f64();
let device_line = format!("> gpu\twrite: {:.1}s, read: {:.1}s",
let device_line = format!("> gpu\tstep: {:.1}s, write: {:.1}s, read: {:.1}s",
device_step_time,
device_write_time,
device_read_time,
);
@@ -166,6 +171,9 @@ impl SyncDiagnostics {
ret
}
/// Accumulate `t` into the running total of time the GPU spent stepping.
pub fn record_step_device(&self, t: Duration) {
    let mut diag = self.0.lock().unwrap();
    diag.time_stepping_device += t;
}
pub fn instrument_read_device<R, F: FnOnce() -> R>(&self, f: F) -> R {
let (elapsed, ret) = Self::measure(f);
self.0.lock().unwrap().time_reading_device += elapsed;
@@ -176,6 +184,5 @@ impl SyncDiagnostics {
self.0.lock().unwrap().time_writing_device += elapsed;
ret
}
}

View File

@@ -2,6 +2,7 @@ use futures::FutureExt as _;
use log::info;
use std::borrow::Cow;
use std::num::NonZeroU64;
use std::time::Duration;
use wgpu;
use wgpu::util::DeviceExt as _;
@@ -282,17 +283,28 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
});
// let timestamp_period = queue.get_timestamp_period();
let timestamp_period = queue.get_timestamp_period();
let timestamp_readback_slice = timestamp_buffer.slice(..);
let timestamp_readback_future = timestamp_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
{
let (e_time, h_time) = {
let mapped = timestamp_readback_slice.get_mapped_range();
let timings: &[u64] = unsafe {
from_bytes(mapped.as_ref())
};
println!("timings: {:?}", timings);
}
let (mut e_time, mut h_time) = (0, 0);
for frame in timings.chunks(4) {
e_time += frame[1] - frame[0];
h_time += frame[3] - frame[2];
}
(
Duration::from_nanos((e_time as f64 * timestamp_period as f64) as u64),
Duration::from_nanos((h_time as f64 * timestamp_period as f64) as u64),
)
};
timestamp_buffer.unmap();
diag.record_step_device(e_time + h_time);
});
// optimization note: it may be possible to use `WaitForSubmission`