fearless_simd/src/generated/avx2.rs (36 additions, 0 deletions)
@@ -168,6 +168,10 @@ impl Simd for Avx2 {
unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
@@ -1338,6 +1342,10 @@ impl Simd for Avx2 {
unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
@@ -1559,6 +1567,10 @@ impl Simd for Avx2 {
unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
unsafe { _mm256_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
unsafe {
_mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
@@ -3025,6 +3037,10 @@ impl Simd for Avx2 {
unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
unsafe { _mm256_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
unsafe {
_mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
@@ -3301,6 +3317,10 @@ impl Simd for Avx2 {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -4986,6 +5012,16 @@ impl Simd for Avx2 {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
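For reference, a minimal standalone sketch (not part of this PR) confirming the FNMADD semantics the AVX2 backend relies on: `_mm_fnmadd_ps(a, b, c)` computes `-(a * b) + c`, i.e. `c - (a * b)`, per lane. It assumes an x86_64 machine and uses runtime FMA detection.

```rust
// Standalone check of the intrinsic semantics used above; assumes x86_64.
fn main() {
    #[cfg(target_arch = "x86_64")]
    unsafe {
        use std::arch::x86_64::*;
        if !is_x86_feature_detected!("fma") {
            return;
        }
        // Lanes are listed high-to-low in _mm_set_ps, so a = [1, 2, 3, 4].
        let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
        let b = _mm_set_ps(8.0, 7.0, 6.0, 5.0);
        let c = _mm_set1_ps(100.0);
        // _mm_fnmadd_ps computes -(a * b) + c with a single rounding.
        let r = _mm_fnmadd_ps(a, b, c);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [95.0, 88.0, 79.0, 68.0]); // 100 - 1*5, 100 - 2*6, ...
    }
}
```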
fearless_simd/src/generated/fallback.rs (48 additions, 0 deletions)
@@ -311,6 +311,10 @@ impl Simd for Fallback {
a.mul(b).sub(c)
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
c.sub(a.mul(b))
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
[
f32::floor(a[0usize]),
@@ -3198,6 +3202,10 @@ impl Simd for Fallback {
a.mul(b).sub(c)
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
c.sub(a.mul(b))
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
[f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self)
}
@@ -3471,6 +3479,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_neg_add_f32x4(a0, b0, c0),
self.mul_neg_add_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
@@ -5019,6 +5037,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_neg_add_f64x2(a0, b0, c0),
self.mul_neg_add_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
@@ -5315,6 +5343,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -6985,6 +7023,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
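The fallback path evaluates `c - (a * b)` as an ordinary multiply followed by a subtract, so it rounds twice. A small scalar sketch (not from the PR) showing where the two paths can visibly diverge, using `f32::mul_add` as a stand-in for the fused form:

```rust
// Two roundings vs. one: the fallback's c - a*b can lose the residual that a
// fused c - (a * b) preserves.
fn main() {
    let eps = f32::EPSILON; // 2^-23
    let (a, b, c) = (1.0 + eps, 1.0 - eps, 1.0_f32);
    // Fallback-style: a * b = 1 - 2^-46 rounds to 1.0, so the result is 0.0.
    let two_roundings = c - a * b;
    // Fused-style: (-a) * b + c is computed with a single rounding.
    let one_rounding = (-a).mul_add(b, c);
    assert_eq!(two_roundings, 0.0);
    assert_eq!(one_rounding, 2.0_f32.powi(-46));
}
```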
fearless_simd/src/generated/neon.rs (48 additions, 0 deletions)
@@ -159,6 +159,10 @@ impl Simd for Neon {
unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
unsafe { vfmsq_f32(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe { vrndmq_f32(a.into()).simd_into(self) }
}
@@ -1227,6 +1231,10 @@ impl Simd for Neon {
unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
unsafe { vfmsq_f64(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe { vrndmq_f64(a.into()).simd_into(self) }
}
@@ -1476,6 +1484,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_neg_add_f32x4(a0, b0, c0),
self.mul_neg_add_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
@@ -3011,6 +3029,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_neg_add_f64x2(a0, b0, c0),
self.mul_neg_add_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
@@ -3307,6 +3335,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -4816,6 +4854,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
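Note the argument order on the NEON path: `vfmsq_f32(acc, x, y)` computes `acc - (x * y)`, so passing `(c, b, a)` yields `c - (a * b)` directly, with no extra negation (unlike the neighboring `mul_sub` implementations, which need `vnegq_f32`). A standalone sketch (aarch64 only, not part of the PR):

```rust
// vfmsq_f32(acc, x, y) = acc - (x * y); NEON is baseline on aarch64, so the
// unsafe call is sound without runtime feature detection.
fn main() {
    #[cfg(target_arch = "aarch64")]
    unsafe {
        use std::arch::aarch64::*;
        let a = vdupq_n_f32(2.0);
        let b = vdupq_n_f32(3.0);
        let c = vdupq_n_f32(10.0);
        let r = vfmsq_f32(c, b, a); // 10 - (3 * 2) = 4 in every lane
        assert_eq!(vgetq_lane_f32::<0>(r), 4.0);
    }
}
```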
fearless_simd/src/generated/simd_trait.rs (14 additions, 0 deletions)
@@ -166,6 +166,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -659,6 +661,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -752,6 +756,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1267,6 +1273,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1364,6 +1372,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1873,6 +1883,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -2024,6 +2036,8 @@ pub trait SimdFloat<Element: SimdElement, S: Simd>:
fn mul_add(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Compute `(self * op1) - op2` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Compute `op2 - (self * op1)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor(self) -> Self;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
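A hedged sketch of how downstream code might use the new operation: the Newton-Raphson reciprocal refinement `x' = x * (2 - d * x)` is a classic consumer of `c - (a * b)`. The `splat_f32x4` and `mul_f32x4` helpers below are assumptions about the wider `Simd` trait, not shown in this diff; only `mul_neg_add_f32x4` is introduced here.

```rust
use fearless_simd::{Simd, f32x4};

/// One Newton-Raphson refinement step for a reciprocal estimate `x` of `d`:
/// x' = x * (2 - d * x). The parenthesized factor is mul_neg_add(d, x, 2),
/// which benefits from the single rounding on FMA-capable targets.
fn refine_recip<S: Simd>(simd: S, d: f32x4<S>, x: f32x4<S>) -> f32x4<S> {
    let two = simd.splat_f32x4(2.0); // assumed splat constructor
    let factor = simd.mul_neg_add_f32x4(d, x, two); // 2 - d * x
    simd.mul_f32x4(factor, x) // assumed element-wise multiply
}
```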