capture high-level GPU timing diagnostics
this shows that we spend about 2/3 of the GPU roundtrip time on stepping. the other 1/3 is i guess scheduling/latency/memory transfers.
This commit is contained in:
@@ -22,6 +22,8 @@ pub struct Diagnostics {
|
||||
time_reading_device: Duration,
|
||||
/// time during which CPU was transferring data to GPU
|
||||
time_writing_device: Duration,
|
||||
/// time during which the GPU was actively computing steps
|
||||
time_stepping_device: Duration,
|
||||
start_time: Instant,
|
||||
}
|
||||
|
||||
@@ -48,6 +50,7 @@ impl Diagnostics {
|
||||
time_blocked_on_render: Default::default(),
|
||||
time_reading_device: Default::default(),
|
||||
time_writing_device: Default::default(),
|
||||
time_stepping_device: Default::default(),
|
||||
start_time: Instant::now(),
|
||||
}
|
||||
}
|
||||
@@ -78,9 +81,11 @@ impl Diagnostics {
|
||||
other_time,
|
||||
);
|
||||
|
||||
let device_step_time = self.time_stepping_device.as_secs_f64();
|
||||
let device_write_time = self.time_writing_device.as_secs_f64();
|
||||
let device_read_time = self.time_reading_device.as_secs_f64();
|
||||
let device_line = format!("> gpu\twrite: {:.1}s, read: {:.1}s",
|
||||
let device_line = format!("> gpu\tstep: {:.1}s, write: {:.1}s, read: {:.1}s",
|
||||
device_step_time,
|
||||
device_write_time,
|
||||
device_read_time,
|
||||
);
|
||||
@@ -166,6 +171,9 @@ impl SyncDiagnostics {
|
||||
ret
|
||||
}
|
||||
|
||||
pub fn record_step_device(&self, t: Duration) {
|
||||
self.0.lock().unwrap().time_stepping_device += t;
|
||||
}
|
||||
pub fn instrument_read_device<R, F: FnOnce() -> R>(&self, f: F) -> R {
|
||||
let (elapsed, ret) = Self::measure(f);
|
||||
self.0.lock().unwrap().time_reading_device += elapsed;
|
||||
@@ -176,6 +184,5 @@ impl SyncDiagnostics {
|
||||
self.0.lock().unwrap().time_writing_device += elapsed;
|
||||
ret
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@@ -2,6 +2,7 @@ use futures::FutureExt as _;
|
||||
use log::info;
|
||||
use std::borrow::Cow;
|
||||
use std::num::NonZeroU64;
|
||||
use std::time::Duration;
|
||||
use wgpu;
|
||||
use wgpu::util::DeviceExt as _;
|
||||
|
||||
@@ -282,17 +283,28 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
|
||||
});
|
||||
|
||||
|
||||
// let timestamp_period = queue.get_timestamp_period();
|
||||
let timestamp_period = queue.get_timestamp_period();
|
||||
let timestamp_readback_slice = timestamp_buffer.slice(..);
|
||||
let timestamp_readback_future = timestamp_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
|
||||
{
|
||||
let (e_time, h_time) = {
|
||||
let mapped = timestamp_readback_slice.get_mapped_range();
|
||||
let timings: &[u64] = unsafe {
|
||||
from_bytes(mapped.as_ref())
|
||||
};
|
||||
println!("timings: {:?}", timings);
|
||||
}
|
||||
|
||||
let (mut e_time, mut h_time) = (0, 0);
|
||||
for frame in timings.chunks(4) {
|
||||
e_time += frame[1] - frame[0];
|
||||
h_time += frame[3] - frame[2];
|
||||
}
|
||||
|
||||
(
|
||||
Duration::from_nanos((e_time as f64 * timestamp_period as f64) as u64),
|
||||
Duration::from_nanos((h_time as f64 * timestamp_period as f64) as u64),
|
||||
)
|
||||
};
|
||||
timestamp_buffer.unmap();
|
||||
diag.record_step_device(e_time + h_time);
|
||||
});
|
||||
|
||||
// optimization note: it may be possible to use `WaitForSubmission`
|
||||
|
Reference in New Issue
Block a user