spirv: instrument the device read/write operations

2022-08-21 18:51:51 -07:00
parent 6c9a6e1ffa
commit 98d6a5b34f
4 changed files with 48 additions and 13 deletions
--- a/crates/coremem/src/diagnostics.rs
+++ b/crates/coremem/src/diagnostics.rs
@@ -11,6 +11,8 @@ pub struct Diagnostics {
    time_rendering: Duration,
    time_blocked_on_stim: Duration,
    time_blocked_on_render: Duration,
+    time_reading_device: Duration,
+    time_writing_device: Duration,
    start_time: Instant,
 }

@@ -33,6 +35,8 @@ impl Diagnostics {
            time_rendering: Default::default(),
            time_blocked_on_stim: Default::default(),
            time_blocked_on_render: Default::default(),
+            time_reading_device: Default::default(),
+            time_writing_device: Default::default(),
            start_time: Instant::now(),
        }
    }
@@ -49,7 +53,7 @@ impl Diagnostics {
        let other_time = overall_time - step_time - stim_block_time - render_block_time - render_prep_time;
        let fps = (self.frames_completed as f64) / overall_time;

-        format!("fps: {:6.2} (step: {:.1}s, [stim: {:.1}s render: {:.1}s], blocked: (stim: {:.1}s render: {:.1}s), render_prep: {:.1}s, other: {:.1}s)",
+        let main_line = format!("fps: {:6.2} (step: {:.1}s, [stim: {:.1}s render: {:.1}s], blocked: (stim: {:.1}s render: {:.1}s), render_prep: {:.1}s, other: {:.1}s)",
            fps,
            step_time,
            stim_time,
@@ -58,7 +62,15 @@ impl Diagnostics {
            render_block_time,
            render_prep_time,
            other_time,
-        )
+        );
+        let device_write_time = self.time_writing_device.as_secs_f64();
+        let device_read_time = self.time_reading_device.as_secs_f64();
+        let sub_line = format!("gpu> write: {:.1}s read: {:.1}s",
+            device_write_time,
+            device_read_time,
+        );
+
+        format!("{}\n  {}", main_line, sub_line)
    }
 }

@@ -121,5 +133,16 @@ impl SyncDiagnostics {
        ret
    }

+    pub fn instrument_read_device<R, F: FnOnce() -> R>(&self, f: F) -> R { // run `f`, charging its measured time to the device-read counter
+        let (elapsed, ret) = Self::measure(f); // `measure` consumes `f` and hands back (elapsed time, f's return value)
+        self.0.lock().unwrap().time_reading_device += elapsed; // accumulate under the lock; `unwrap` panics if `lock()` fails (presumably a poisoned mutex -- confirm)
+        ret // pass `f`'s result through unchanged
+    }
+    pub fn instrument_write_device<R, F: FnOnce() -> R>(&self, f: F) -> R { // run `f`, charging its measured time to the device-write counter
+        let (elapsed, ret) = Self::measure(f); // `measure` consumes `f` and hands back (elapsed time, f's return value)
+        self.0.lock().unwrap().time_writing_device += elapsed; // accumulate under the lock; `unwrap` panics if `lock()` fails (presumably a poisoned mutex -- confirm)
+        ret // pass `f`'s result through unchanged
+    }
+
 }

--- a/crates/coremem/src/sim/spirv/cpu.rs
+++ b/crates/coremem/src/sim/spirv/cpu.rs
@@ -1,3 +1,5 @@
+use crate::diagnostics::SyncDiagnostics;
+
 use coremem_cross::mat::Material;
 use coremem_cross::real::Real;
 use coremem_cross::step::{SimMeta, StepEContext, StepHContext};
@@ -11,6 +13,7 @@ pub struct CpuBackend;
 impl<R: Real, M: Material<R>> SimBackend<R, M> for CpuBackend {
    fn step_n(
        &mut self,
+        _diag: &SyncDiagnostics,
        meta: SimMeta<R>,
        mat: &[M],
        stim_e: &[Vec3<R>],
--- a/crates/coremem/src/sim/spirv/gpu.rs
+++ b/crates/coremem/src/sim/spirv/gpu.rs
@@ -5,6 +5,8 @@ use std::num::NonZeroU64;
 use wgpu;
 use wgpu::util::DeviceExt as _;

+use crate::diagnostics::SyncDiagnostics;
+
 use coremem_cross::vec::{Vec3, Vec3u};
 use coremem_cross::step::SimMeta;

@@ -55,6 +57,7 @@ impl WgpuHandles {
 impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBackend {
    fn step_n(
        &mut self,
+        diag: &SyncDiagnostics,
        meta: SimMeta<R>,
        mat: &[M],
        stim_cpu_e: &[Vec3<R>],
@@ -228,15 +231,17 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
            field_bytes as u64,
        );

-        queue.submit(Some(encoder.finish()));
+        diag.instrument_write_device(move || {
+            queue.submit(Some(encoder.finish()));
+        });

        let e_readback_slice = e_readback_buffer.slice(..);
        let e_readback_future = e_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
-                e.copy_from_slice(unsafe {
-                    from_bytes(e_readback_slice.get_mapped_range().as_ref())
-                });
-                e_readback_buffer.unmap();
+            e.copy_from_slice(unsafe {
+                from_bytes(e_readback_slice.get_mapped_range().as_ref())
            });
+            e_readback_buffer.unmap();
+        });

        let h_readback_slice = h_readback_buffer.slice(..);
        let h_readback_future = h_readback_slice.map_async(wgpu::MapMode::Read).then(|_| async {
@@ -254,11 +259,15 @@ impl<R: Copy, M: Send + Sync + HasEntryPoints<R>> SimBackend<R, M> for WgpuBacke
            m_readback_buffer.unmap();
        });

+        // optimization note: it may be possible to use `WaitForSubmission` instead,
+        // and copy data to/from the device even as the GPU begins executing the next job.
        device.poll(wgpu::Maintain::Wait);

-        futures::executor::block_on(futures::future::join(
-            e_readback_future, futures::future::join(
-                h_readback_future, m_readback_future)));
+        diag.instrument_read_device(move || {
+            futures::executor::block_on(futures::future::join(
+                e_readback_future, futures::future::join(
+                    h_readback_future, m_readback_future)));
+        });
    }
 }

--- a/crates/coremem/src/sim/spirv/mod.rs
+++ b/crates/coremem/src/sim/spirv/mod.rs
@@ -24,6 +24,7 @@ pub use gpu::WgpuBackend;
 pub trait SimBackend<R, M> {
    fn step_n(
        &mut self,
+        diag: &SyncDiagnostics,
        meta: SimMeta<R>,
        mat: &[M],
        stim_e: &[Vec3<R>],
@@ -101,6 +102,7 @@ where
    fn step_multiple<S: Stimulus>(&mut self, num_steps: u32, stim: &S) {
        let (stim_e, stim_h) = self.eval_stimulus(stim);
        self.backend.step_n(
+            &self.diag,
            self.meta,
            self.mat.as_slice(),
            &*stim_e,
@@ -226,8 +228,7 @@ where
    fn eval_stimulus<'a, S: Stimulus>(&self, stim: &'a S)
        -> (&'a [Vec3<R>], &'a [Vec3<R>])
    {
-        trace!("eval_stimulus begin");
-        let (e, h) = self.diag.instrument_stimuli(|| {
+        let (e, h) = self.diag.instrument_stimuli_blocked(|| {
            let dim = self.size();
            let dim_len = dim.product_sum_usize();
            let feature_size = self.feature_size();
@@ -242,7 +243,6 @@ where
            // TODO: find a way to remove this
            unsafe { std::mem::transmute((e, h)) }
        });
-        trace!("eval_stimulus end");
        (e, h)
    }
 }