Skip to content

Commit 704f309

Browse files
author
João Felipe Santos
committed
Remove redundant manual loop unrolling from activations and element-wise ops
ARM assembly analysis (-O2 -DNDEBUG) confirmed:

- GCC auto-unrolls simple activation loops; manual 4-wide gives no benefit
- expf() serializes sigmoid/SiLU; unrolling can't help
- Eigen element-wise ops (.leftCols + .leftCols) produce identical codegen to raw float* loops when assertions are disabled

Simplify 5 activation classes to use inline helpers (relu, sigmoid, etc.) and revert 3 wavenet element-wise operations back to Eigen expressions.

Inline GEMM (Conv1x1/Conv1D), depthwise unrolling, FiLM unrolling, bias broadcast, and memcpy optimizations are retained — those show measurable wins on both desktop and Cortex-M7.

Also restored comments that were accidentally removed from wavenet.h.
1 parent 5d9ed6c commit 704f309

File tree

4 files changed

+15
-177
lines changed

4 files changed

+15
-177
lines changed

NAM/activations.cpp

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,6 @@ std::unordered_map<std::string, nam::activations::Activation::Ptr> nam::activati
3737

3838
nam::activations::Activation::Ptr tanh_bak = nullptr;
3939
nam::activations::Activation::Ptr sigmoid_bak = nullptr;
40-
nam::activations::Activation::Ptr silu_bak = nullptr;
4140

4241
nam::activations::Activation::Ptr nam::activations::Activation::get_activation(const std::string name)
4342
{
@@ -198,14 +197,9 @@ void nam::activations::Activation::enable_lut(std::string function_name, float m
198197
fn = sigmoid;
199198
sigmoid_bak = _activations["Sigmoid"];
200199
}
201-
else if (function_name == "SiLU")
202-
{
203-
fn = swish;
204-
silu_bak = _activations["SiLU"];
205-
}
206200
else
207201
{
208-
throw std::runtime_error("Tried to enable LUT for a function other than Tanh, Sigmoid, or SiLU");
202+
throw std::runtime_error("Tried to enable LUT for a function other than Tanh or Sigmoid");
209203
}
210204
_activations[function_name] = std::make_shared<FastLUTActivation>(min, max, n_points, fn);
211205
}
@@ -220,12 +214,8 @@ void nam::activations::Activation::disable_lut(std::string function_name)
220214
{
221215
_activations["Sigmoid"] = sigmoid_bak;
222216
}
223-
else if (function_name == "SiLU")
224-
{
225-
_activations["SiLU"] = silu_bak;
226-
}
227217
else
228218
{
229-
throw std::runtime_error("Tried to disable LUT for a function other than Tanh, Sigmoid, or SiLU");
219+
throw std::runtime_error("Tried to disable LUT for a function other than Tanh or Sigmoid");
230220
}
231221
}

NAM/activations.h

Lines changed: 6 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -235,24 +235,8 @@ class ActivationReLU : public Activation
235235
public:
236236
void apply(float* data, long size) override
237237
{
238-
// Optimized ReLU with loop unrolling
239-
long pos = 0;
240-
// Process 4 elements at a time
241-
for (; pos + 3 < size; pos += 4)
242-
{
243-
// Branchless ReLU using conditional
244-
const float v0 = data[pos], v1 = data[pos + 1];
245-
const float v2 = data[pos + 2], v3 = data[pos + 3];
246-
data[pos] = v0 > 0.0f ? v0 : 0.0f;
247-
data[pos + 1] = v1 > 0.0f ? v1 : 0.0f;
248-
data[pos + 2] = v2 > 0.0f ? v2 : 0.0f;
249-
data[pos + 3] = v3 > 0.0f ? v3 : 0.0f;
250-
}
251-
// Handle remainder
252-
for (; pos < size; pos++)
253-
{
254-
data[pos] = data[pos] > 0.0f ? data[pos] : 0.0f;
255-
}
238+
for (long pos = 0; pos < size; pos++)
239+
data[pos] = relu(data[pos]);
256240
}
257241
};
258242

@@ -316,23 +300,8 @@ class ActivationSigmoid : public Activation
316300
public:
317301
void apply(float* data, long size) override
318302
{
319-
long pos = 0;
320-
// Process 4 elements at a time
321-
for (; pos + 3 < size; pos += 4)
322-
{
323-
const float x0 = data[pos], x1 = data[pos + 1];
324-
const float x2 = data[pos + 2], x3 = data[pos + 3];
325-
326-
data[pos] = 1.0f / (1.0f + expf(-x0));
327-
data[pos + 1] = 1.0f / (1.0f + expf(-x1));
328-
data[pos + 2] = 1.0f / (1.0f + expf(-x2));
329-
data[pos + 3] = 1.0f / (1.0f + expf(-x3));
330-
}
331-
// Handle remainder
332-
for (; pos < size; pos++)
333-
{
303+
for (long pos = 0; pos < size; pos++)
334304
data[pos] = sigmoid(data[pos]);
335-
}
336305
}
337306
};
338307

@@ -341,28 +310,8 @@ class ActivationSwish : public Activation
341310
public:
342311
void apply(float* data, long size) override
343312
{
344-
long pos = 0;
345-
// Process 4 elements at a time: swish(x) = x * sigmoid(x) = x / (1 + exp(-x))
346-
for (; pos + 3 < size; pos += 4)
347-
{
348-
const float x0 = data[pos], x1 = data[pos + 1];
349-
const float x2 = data[pos + 2], x3 = data[pos + 3];
350-
351-
const float s0 = 1.0f / (1.0f + expf(-x0));
352-
const float s1 = 1.0f / (1.0f + expf(-x1));
353-
const float s2 = 1.0f / (1.0f + expf(-x2));
354-
const float s3 = 1.0f / (1.0f + expf(-x3));
355-
356-
data[pos] = x0 * s0;
357-
data[pos + 1] = x1 * s1;
358-
data[pos + 2] = x2 * s2;
359-
data[pos + 3] = x3 * s3;
360-
}
361-
// Handle remainder
362-
for (; pos < size; pos++)
363-
{
313+
for (long pos = 0; pos < size; pos++)
364314
data[pos] = swish(data[pos]);
365-
}
366315
}
367316
};
368317

@@ -371,32 +320,8 @@ class ActivationHardSwish : public Activation
371320
public:
372321
void apply(float* data, long size) override
373322
{
374-
const float inv6 = 1.0f / 6.0f;
375-
long pos = 0;
376-
// Process 4 elements at a time
377-
for (; pos + 3 < size; pos += 4)
378-
{
379-
const float x0 = data[pos], x1 = data[pos + 1];
380-
const float x2 = data[pos + 2], x3 = data[pos + 3];
381-
382-
const float t0 = x0 + 3.0f, t1 = x1 + 3.0f;
383-
const float t2 = x2 + 3.0f, t3 = x3 + 3.0f;
384-
385-
const float c0 = t0 < 0.0f ? 0.0f : (t0 > 6.0f ? 6.0f : t0);
386-
const float c1 = t1 < 0.0f ? 0.0f : (t1 > 6.0f ? 6.0f : t1);
387-
const float c2 = t2 < 0.0f ? 0.0f : (t2 > 6.0f ? 6.0f : t2);
388-
const float c3 = t3 < 0.0f ? 0.0f : (t3 > 6.0f ? 6.0f : t3);
389-
390-
data[pos] = x0 * c0 * inv6;
391-
data[pos + 1] = x1 * c1 * inv6;
392-
data[pos + 2] = x2 * c2 * inv6;
393-
data[pos + 3] = x3 * c3 * inv6;
394-
}
395-
// Handle remainder
396-
for (; pos < size; pos++)
397-
{
323+
for (long pos = 0; pos < size; pos++)
398324
data[pos] = hardswish(data[pos]);
399-
}
400325
}
401326
};
402327

@@ -405,23 +330,8 @@ class ActivationSoftsign : public Activation
405330
public:
406331
void apply(float* data, long size) override
407332
{
408-
long pos = 0;
409-
// Process 4 elements at a time
410-
for (; pos + 3 < size; pos += 4)
411-
{
412-
const float x0 = data[pos], x1 = data[pos + 1];
413-
const float x2 = data[pos + 2], x3 = data[pos + 3];
414-
415-
data[pos] = x0 / (1.0f + fabsf(x0));
416-
data[pos + 1] = x1 / (1.0f + fabsf(x1));
417-
data[pos + 2] = x2 / (1.0f + fabsf(x2));
418-
data[pos + 3] = x3 / (1.0f + fabsf(x3));
419-
}
420-
// Handle remainder
421-
for (; pos < size; pos++)
422-
{
333+
for (long pos = 0; pos < size; pos++)
423334
data[pos] = softsign(data[pos]);
424-
}
425335
}
426336
};
427337

NAM/wavenet.cpp

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -124,33 +124,8 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
124124
Eigen::MatrixXf& input_mixin_output = this->_input_mixin.GetOutput();
125125
this->_input_mixin_post_film->Process_(input_mixin_output, condition, num_frames);
126126
}
127-
#ifdef NAM_USE_INLINE_GEMM
128-
// Optimized matrix addition for small channel counts
129-
{
130-
const int channels = (int)_conv.get_out_channels();
131-
const float* __restrict__ conv_ptr = _conv.GetOutput().data();
132-
const float* __restrict__ mixin_ptr = _input_mixin.GetOutput().data();
133-
float* __restrict__ z_ptr = this->_z.data();
134-
const int total = channels * num_frames;
135-
136-
// Unrolled addition
137-
int i = 0;
138-
for (; i + 3 < total; i += 4)
139-
{
140-
z_ptr[i] = conv_ptr[i] + mixin_ptr[i];
141-
z_ptr[i + 1] = conv_ptr[i + 1] + mixin_ptr[i + 1];
142-
z_ptr[i + 2] = conv_ptr[i + 2] + mixin_ptr[i + 2];
143-
z_ptr[i + 3] = conv_ptr[i + 3] + mixin_ptr[i + 3];
144-
}
145-
for (; i < total; i++)
146-
{
147-
z_ptr[i] = conv_ptr[i] + mixin_ptr[i];
148-
}
149-
}
150-
#else
151127
this->_z.leftCols(num_frames).noalias() =
152128
_conv.GetOutput().leftCols(num_frames) + _input_mixin.GetOutput().leftCols(num_frames);
153-
#endif
154129

155130
if (this->_activation_pre_film)
156131
{
@@ -282,28 +257,8 @@ void nam::wavenet::_Layer::Process(const Eigen::MatrixXf& input, const Eigen::Ma
282257
// Store output to next layer (residual connection: input + layer1x1 output, or just input if layer1x1 inactive)
283258
if (this->_layer1x1)
284259
{
285-
#ifdef NAM_USE_INLINE_GEMM
286-
{
287-
const int channels = (int)this->get_channels();
288-
const int total = channels * num_frames;
289-
const float* __restrict__ in_ptr = input.data();
290-
const float* __restrict__ layer_ptr = this->_layer1x1->GetOutput().data();
291-
float* __restrict__ dst = this->_output_next_layer.data();
292-
int i = 0;
293-
for (; i + 3 < total; i += 4)
294-
{
295-
dst[i] = in_ptr[i] + layer_ptr[i];
296-
dst[i + 1] = in_ptr[i + 1] + layer_ptr[i + 1];
297-
dst[i + 2] = in_ptr[i + 2] + layer_ptr[i + 2];
298-
dst[i + 3] = in_ptr[i + 3] + layer_ptr[i + 3];
299-
}
300-
for (; i < total; i++)
301-
dst[i] = in_ptr[i] + layer_ptr[i];
302-
}
303-
#else
304260
this->_output_next_layer.leftCols(num_frames).noalias() =
305261
input.leftCols(num_frames) + this->_layer1x1->GetOutput().leftCols(num_frames);
306-
#endif
307262
}
308263
else
309264
{
@@ -415,26 +370,7 @@ void nam::wavenet::_LayerArray::ProcessInner(const Eigen::MatrixXf& layer_inputs
415370
}
416371

417372
// Accumulate head output from this layer
418-
#ifdef NAM_USE_INLINE_GEMM
419-
{
420-
const int channels = (int)this->_head_output_size;
421-
const int total = channels * num_frames;
422-
const float* __restrict__ src = this->_layers[i].GetOutputHead().data();
423-
float* __restrict__ dst = this->_head_inputs.data();
424-
int j = 0;
425-
for (; j + 3 < total; j += 4)
426-
{
427-
dst[j] += src[j];
428-
dst[j + 1] += src[j + 1];
429-
dst[j + 2] += src[j + 2];
430-
dst[j + 3] += src[j + 3];
431-
}
432-
for (; j < total; j++)
433-
dst[j] += src[j];
434-
}
435-
#else
436373
this->_head_inputs.leftCols(num_frames).noalias() += this->_layers[i].GetOutputHead().leftCols(num_frames);
437-
#endif
438374
}
439375

440376
// Store output from last layer - use memcpy for pure copy

NAM/wavenet.h

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,11 @@ class _Layer
385385
std::unique_ptr<Conv1x1> _layer1x1;
386386
// The post-activation 1x1 convolution outputting to the head, optional
387387
std::unique_ptr<Conv1x1> _head1x1;
388-
388+
// The internal state
389389
Eigen::MatrixXf _z;
390+
// Output to next layer (residual connection: input + layer1x1 output, or just input if layer1x1 inactive)
390391
Eigen::MatrixXf _output_next_layer;
392+
// Output to head (skip connection: activated conv output)
391393
Eigen::MatrixXf _output_head;
392394

393395
activations::Activation::Ptr _activation;
@@ -604,12 +606,12 @@ class _LayerArray
604606

605607
// The layer objects
606608
std::vector<_Layer> _layers;
607-
609+
// Output from last layer (for next layer array)
608610
Eigen::MatrixXf _layer_outputs;
609-
Eigen::MatrixXf _head_inputs;
610-
611611
// Accumulated head inputs from all layers
612612
// Size is _head_output_size (= head1x1.out_channels if head1x1 active, else bottleneck)
613+
Eigen::MatrixXf _head_inputs;
614+
613615
// Rechannel for the head (_head_output_size -> head_size)
614616
Conv1x1 _head_rechannel;
615617

@@ -668,9 +670,9 @@ class WaveNet : public DSP
668670
void set_weights_(std::vector<float>::iterator& weights);
669671

670672
protected:
673+
// Element-wise arrays:
671674
Eigen::MatrixXf _condition_input;
672675
Eigen::MatrixXf _condition_output;
673-
674676
std::unique_ptr<DSP> _condition_dsp;
675677
// Temporary buffers for condition DSP processing (to avoid allocations in _process_condition)
676678
std::vector<std::vector<NAM_SAMPLE>> _condition_dsp_input_buffers;

0 commit comments

Comments (0)