capture high-level GPU timing diagnostics

This shows that we spend about 2/3 of the GPU roundtrip time on
stepping; the other 1/3 is presumably scheduling/latency/memory transfers.
This commit is contained in:
2022-09-27 00:20:41 -07:00
parent 1387506511
commit a3d9b28ab6
2 changed files with 25 additions and 6 deletions

View File

@@ -22,6 +22,8 @@ pub struct Diagnostics {
time_reading_device: Duration,
/// time during which CPU was transferring data to GPU
time_writing_device: Duration,
/// time during which the GPU was actively computing steps
time_stepping_device: Duration,
start_time: Instant,
}
@@ -48,6 +50,7 @@ impl Diagnostics {
time_blocked_on_render: Default::default(),
time_reading_device: Default::default(),
time_writing_device: Default::default(),
time_stepping_device: Default::default(),
start_time: Instant::now(),
}
}
@@ -78,9 +81,11 @@ impl Diagnostics {
other_time,
);
let device_step_time = self.time_stepping_device.as_secs_f64();
let device_write_time = self.time_writing_device.as_secs_f64();
let device_read_time = self.time_reading_device.as_secs_f64();
let device_line = format!("> gpu\twrite: {:.1}s, read: {:.1}s",
let device_line = format!("> gpu\tstep: {:.1}s, write: {:.1}s, read: {:.1}s",
device_step_time,
device_write_time,
device_read_time,
);
@@ -166,6 +171,9 @@ impl SyncDiagnostics {
ret
}
/// Accumulate `t` into the running total of time the GPU spent stepping.
pub fn record_step_device(&self, t: Duration) {
    let mut diag = self.0.lock().unwrap();
    diag.time_stepping_device += t;
}
pub fn instrument_read_device<R, F: FnOnce() -> R>(&self, f: F) -> R {
let (elapsed, ret) = Self::measure(f);
self.0.lock().unwrap().time_reading_device += elapsed;
@@ -176,6 +184,5 @@ impl SyncDiagnostics {
self.0.lock().unwrap().time_writing_device += elapsed;
ret
}
}

View File

@@ -2,6 +2,7 @@ use futures::FutureExt as _;
use log::info;
use std::borrow::Cow;
use std::num::NonZeroU64;
use std::time::Duration;
use wgpu;
use wgpu::util::DeviceExt as _;
@@ -282,17 +283,28 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
});
// let timestamp_period = queue.get_timestamp_period();
let timestamp_period = queue.get_timestamp_period();
let timestamp_readback_slice = timestamp_buffer.slice(..);
let timestamp_readback_future = timestamp_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
{
let (e_time, h_time) = {
let mapped = timestamp_readback_slice.get_mapped_range();
let timings: &[u64] = unsafe {
from_bytes(mapped.as_ref())
};
println!("timings: {:?}", timings);
}
let (mut e_time, mut h_time) = (0, 0);
for frame in timings.chunks(4) {
e_time += frame[1] - frame[0];
h_time += frame[3] - frame[2];
}
(
Duration::from_nanos((e_time as f64 * timestamp_period as f64) as u64),
Duration::from_nanos((h_time as f64 * timestamp_period as f64) as u64),
)
};
timestamp_buffer.unmap();
diag.record_step_device(e_time + h_time);
});
// optimization note: it may be possible to use `WaitForSubmission`