From 82af4b100d1471275d5497b2f79c96a76c8c55cc Mon Sep 17 00:00:00 2001
From: colin <colin@uninsane.org>
Date: Mon, 22 Aug 2022 01:07:27 -0700
Subject: [PATCH] driver: optimize the `step_multiple` step count calculation

the old per-frame polling loop was using an excessive amount of compute
(as much as 12% of net execution time).
about 7% of execution time is still unaccounted for, down from 12-15%.
---
 crates/coremem/src/driver.rs | 32 ++++++++++++++++++--------------
 crates/coremem/src/render.rs | 11 +++++++++++
 2 files changed, 29 insertions(+), 14 deletions(-)
diff --git a/crates/coremem/src/driver.rs b/crates/coremem/src/driver.rs
index 32fcb7b..d6b4e33 100644
--- a/crates/coremem/src/driver.rs
+++ b/crates/coremem/src/driver.rs
@@ -258,23 +258,26 @@ where
             self.render();
         }
 
-        // TODO: optimize. this single block takes as much as 12% of net execution time
-        let mut can_step = 1;
-        while can_step < at_most
-            && !self.renderer.any_work_for_frame(start_step + can_step as u64)
-            && !self.stimuli.any_work_for_frame(start_step + can_step as u64)
-        {
-            can_step += 1;
-        }
+        // maybe the renderer or stimulus needs servicing before the max frame the user asked for.
+        // step less than `at_most`, in that case.
+        let next_frame_for_user = start_step + at_most as u64;
+        let next_frame_to_render = self.renderer.next_frame_for_work(start_step);
+        let next_frame_for_stim = self.stimuli.next_frame_for_work(start_step);
+        let step_to = [Some(next_frame_for_user), next_frame_to_render, Some(next_frame_for_stim)]
+            .into_iter()
+            .flatten()
+            .min()
+            .unwrap();
+        let steps_this_time = (step_to - start_step).try_into().unwrap();
 
         let meta = self.state.meta();
         let stim = self.stimuli.get_for(meta, start_step);
         // prefetch the next stimulus, in the background.
-        self.stimuli.start_job(meta, start_step + can_step as u64);
+        self.stimuli.start_job(meta, step_to);
 
         trace!("step begin");
-        self.diag.instrument_step(can_step as u64, || {
-            self.state.step_multiple(can_step, &stim);
+        self.diag.instrument_step(steps_this_time as u64, || {
+            self.state.step_multiple(steps_this_time, &stim);
         });
         trace!("step end");
 
@@ -291,7 +294,7 @@ where
                 percent_complete, sim_time, step, diagstr
             );
         }
-        can_step as u32
+        steps_this_time
     }
     pub fn step_multiple(&mut self, num_steps: u32) {
         let mut steps_remaining = num_steps;
@@ -475,8 +478,9 @@ impl<R, T> StimAccess<R, T> {
         // with the worker joined, there should be no outstanding handles on the arc.
         Arc::try_unwrap(self.stim).ok().unwrap().into_inner().unwrap()
     }
-    fn any_work_for_frame(&self, frame: u64) -> bool {
-        frame % self.steps_per_stimulus == 0
+    fn next_frame_for_work(&self, after: u64) -> u64 { // first multiple of the period > `after`
+        let f = after + self.steps_per_stimulus; // go one full period past `after`...
+        f - f % self.steps_per_stimulus // ...then round down; `%` panics if the period is 0
+    }
 
     /// used internally.
diff --git a/crates/coremem/src/render.rs b/crates/coremem/src/render.rs
index b56a033..2d513bd 100644
--- a/crates/coremem/src/render.rs
+++ b/crates/coremem/src/render.rs
@@ -535,6 +535,14 @@ impl<S> MultiRendererElement<S> {
             Some(end) => frame < end,
         }
     }
+    fn next_frame_for_work(&self, after: u64) -> Option<u64> { // next scheduled frame > `after`
+        let max_frame = after + self.step_frequency; // one full period past `after`...
+        let max_frame = max_frame - max_frame % self.step_frequency; // ...rounded down to a multiple
+        match self.step_limit {
+            None => Some(max_frame), // no limit configured: always report the frame
+            Some(end) => Some(max_frame).filter(|&f| f < end) // `end` is exclusive; else None
+        }
+    }
 }
 
 pub struct MultiRenderer<S> {
@@ -567,6 +575,9 @@ impl<S> MultiRenderer<S> {
     pub fn any_work_for_frame(&self, frame: u64) -> bool {
         self.renderers.read().unwrap().iter().any(|m| m.work_this_frame(frame))
     }
+    pub fn next_frame_for_work(&self, after: u64) -> Option<u64> { // earliest among all renderers
+        self.renderers.read().unwrap().iter().flat_map(|m| m.next_frame_for_work(after)).min()
+    } // yields None when `renderers` is empty or every element returned None
 }
 
 impl<S: AbstractSim> Renderer<S> for MultiRenderer<S> {