fearless_simd/src/generated/avx2.rs (36 additions, 0 deletions)
@@ -168,6 +168,10 @@ impl Simd for Avx2 {
unsafe { _mm_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
unsafe { _mm_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe {
_mm_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
@@ -1338,6 +1342,10 @@ impl Simd for Avx2 {
unsafe { _mm_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
unsafe { _mm_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe {
_mm_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into()).simd_into(self)
@@ -1559,6 +1567,10 @@ impl Simd for Avx2 {
unsafe { _mm256_fmsub_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
unsafe { _mm256_fnmadd_ps(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
unsafe {
_mm256_round_ps::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
@@ -3025,6 +3037,10 @@ impl Simd for Avx2 {
unsafe { _mm256_fmsub_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
unsafe { _mm256_fnmadd_pd(a.into(), b.into(), c.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
unsafe {
_mm256_round_pd::<{ _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC }>(a.into())
@@ -3301,6 +3317,10 @@ impl Simd for Avx2 {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -4986,6 +5012,16 @@ impl Simd for Avx2 {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
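For reference, a minimal standalone sketch (not part of this PR) confirming the FNMADD semantics the AVX2 backend relies on: `_mm_fnmadd_ps(a, b, c)` computes `-(a * b) + c`, i.e. `c - (a * b)`, per lane. It assumes an x86_64 machine and uses runtime FMA detection.

```rust
// Standalone check of the intrinsic semantics used above; assumes x86_64.
fn main() {
    #[cfg(target_arch = "x86_64")]
    unsafe {
        use std::arch::x86_64::*;
        if !is_x86_feature_detected!("fma") {
            return;
        }
        // Lanes are listed high-to-low in _mm_set_ps, so a = [1, 2, 3, 4].
        let a = _mm_set_ps(4.0, 3.0, 2.0, 1.0);
        let b = _mm_set_ps(8.0, 7.0, 6.0, 5.0);
        let c = _mm_set1_ps(100.0);
        // _mm_fnmadd_ps computes -(a * b) + c with a single rounding.
        let r = _mm_fnmadd_ps(a, b, c);
        let mut out = [0.0f32; 4];
        _mm_storeu_ps(out.as_mut_ptr(), r);
        assert_eq!(out, [95.0, 88.0, 79.0, 68.0]); // 100 - 1*5, 100 - 2*6, ...
    }
}
```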
fearless_simd/src/generated/fallback.rs (48 additions, 0 deletions)
@@ -311,6 +311,10 @@ impl Simd for Fallback {
a.mul(b).sub(c)
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
c.sub(a.mul(b))
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
[
f32::floor(a[0usize]),
@@ -3198,6 +3202,10 @@ impl Simd for Fallback {
a.mul(b).sub(c)
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
c.sub(a.mul(b))
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
[f64::floor(a[0usize]), f64::floor(a[1usize])].simd_into(self)
}
@@ -3471,6 +3479,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_neg_add_f32x4(a0, b0, c0),
self.mul_neg_add_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
@@ -5019,6 +5037,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_neg_add_f64x2(a0, b0, c0),
self.mul_neg_add_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
@@ -5315,6 +5343,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -6985,6 +7023,16 @@ impl Simd for Fallback {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
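The fallback path evaluates `c - (a * b)` as an ordinary multiply followed by a subtract, so it rounds twice. A small scalar sketch (not from the PR) showing where the two paths can visibly diverge, using `f32::mul_add` as a stand-in for the fused form:

```rust
// Two roundings vs. one: the fallback's c - a*b can lose the residual that a
// fused c - (a * b) preserves.
fn main() {
    let eps = f32::EPSILON; // 2^-23
    let (a, b, c) = (1.0 + eps, 1.0 - eps, 1.0_f32);
    // Fallback-style: a * b = 1 - 2^-46 rounds to 1.0, so the result is 0.0.
    let two_roundings = c - a * b;
    // Fused-style: (-a) * b + c is computed with a single rounding.
    let one_rounding = (-a).mul_add(b, c);
    assert_eq!(two_roundings, 0.0);
    assert_eq!(one_rounding, 2.0_f32.powi(-46));
}
```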
fearless_simd/src/generated/neon.rs (48 additions, 0 deletions)
@@ -159,6 +159,10 @@ impl Simd for Neon {
unsafe { vnegq_f32(vfmsq_f32(c.into(), b.into(), a.into())).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self> {
unsafe { vfmsq_f32(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self> {
unsafe { vrndmq_f32(a.into()).simd_into(self) }
}
@@ -1227,6 +1231,10 @@ impl Simd for Neon {
unsafe { vnegq_f64(vfmsq_f64(c.into(), b.into(), a.into())).simd_into(self) }
}
#[inline(always)]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self> {
unsafe { vfmsq_f64(c.into(), b.into(), a.into()).simd_into(self) }
}
#[inline(always)]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self> {
unsafe { vrndmq_f64(a.into()).simd_into(self) }
}
@@ -1476,6 +1484,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
let (b0, b1) = self.split_f32x8(b);
let (c0, c1) = self.split_f32x8(c);
self.combine_f32x4(
self.mul_neg_add_f32x4(a0, b0, c0),
self.mul_neg_add_f32x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
let (a0, a1) = self.split_f32x8(a);
self.combine_f32x4(self.floor_f32x4(a0), self.floor_f32x4(a1))
@@ -3011,6 +3029,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
let (b0, b1) = self.split_f64x4(b);
let (c0, c1) = self.split_f64x4(c);
self.combine_f64x2(
self.mul_neg_add_f64x2(a0, b0, c0),
self.mul_neg_add_f64x2(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
let (a0, a1) = self.split_f64x4(a);
self.combine_f64x2(self.floor_f64x2(a0), self.floor_f64x2(a1))
@@ -3307,6 +3335,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
let (b0, b1) = self.split_f32x16(b);
let (c0, c1) = self.split_f32x16(c);
self.combine_f32x8(
self.mul_neg_add_f32x8(a0, b0, c0),
self.mul_neg_add_f32x8(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self> {
let (a0, a1) = self.split_f32x16(a);
self.combine_f32x8(self.floor_f32x8(a0), self.floor_f32x8(a1))
@@ -4816,6 +4854,16 @@ impl Simd for Neon {
)
}
#[inline(always)]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
let (b0, b1) = self.split_f64x8(b);
let (c0, c1) = self.split_f64x8(c);
self.combine_f64x4(
self.mul_neg_add_f64x4(a0, b0, c0),
self.mul_neg_add_f64x4(a1, b1, c1),
)
}
#[inline(always)]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self> {
let (a0, a1) = self.split_f64x8(a);
self.combine_f64x4(self.floor_f64x4(a0), self.floor_f64x4(a1))
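Note the argument order on the NEON path: `vfmsq_f32(acc, x, y)` computes `acc - (x * y)`, so passing `(c, b, a)` yields `c - (a * b)` directly, with no extra negation (unlike the neighboring `mul_sub` implementations, which need `vnegq_f32`). A standalone sketch (aarch64 only, not part of the PR):

```rust
// vfmsq_f32(acc, x, y) = acc - (x * y); NEON is baseline on aarch64, so the
// unsafe call is sound without runtime feature detection.
fn main() {
    #[cfg(target_arch = "aarch64")]
    unsafe {
        use std::arch::aarch64::*;
        let a = vdupq_n_f32(2.0);
        let b = vdupq_n_f32(3.0);
        let c = vdupq_n_f32(10.0);
        let r = vfmsq_f32(c, b, a); // 10 - (3 * 2) = 4 in every lane
        assert_eq!(vgetq_lane_f32::<0>(r), 4.0);
    }
}
```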
fearless_simd/src/generated/simd_trait.rs (14 additions, 0 deletions)
@@ -166,6 +166,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>, c: f32x4<Self>) -> f32x4<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -659,6 +661,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>, c: f64x2<Self>) -> f64x2<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -752,6 +756,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>, c: f32x8<Self>) -> f32x8<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1267,6 +1273,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>, c: f64x4<Self>) -> f64x4<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1364,6 +1372,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>, c: f32x16<Self>) -> f32x16<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -1873,6 +1883,8 @@ pub trait Simd: Sized + Clone + Copy + Send + Sync + Seal + 'static {
fn mul_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute `(a * b) - c` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Compute `c - (a * b)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>, c: f64x8<Self>) -> f64x8<Self>;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
@@ -2024,6 +2036,8 @@ pub trait SimdFloat<Element: SimdElement, S: Simd>:
fn mul_add(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Compute `(self * op1) - op2` (fused multiply-subtract) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a subtract, which will result in two rounding errors."]
fn mul_sub(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Compute `op2 - (self * op1)` (fused negated multiply-add) for each element.\n\nDepending on hardware support, the result may be computed with only one rounding error, or may be implemented as a regular multiply followed by a negated add, which will result in two rounding errors."]
fn mul_neg_add(self, op1: impl SimdInto<Self, S>, op2: impl SimdInto<Self, S>) -> Self;
#[doc = "Return the largest integer less than or equal to each element, that is, round towards negative infinity."]
fn floor(self) -> Self;
#[doc = "Return the smallest integer greater than or equal to each element, that is, round towards positive infinity."]
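A hedged sketch of how downstream code might use the new operation: the Newton-Raphson reciprocal refinement `x' = x * (2 - d * x)` is a classic consumer of `c - (a * b)`. The `splat_f32x4` and `mul_f32x4` helpers below are assumptions about the wider `Simd` trait, not shown in this diff; only `mul_neg_add_f32x4` is introduced here.

```rust
use fearless_simd::{Simd, f32x4};

/// One Newton-Raphson refinement step for a reciprocal estimate `x` of `d`:
/// x' = x * (2 - d * x). The parenthesized factor is mul_neg_add(d, x, 2),
/// which benefits from the single rounding on FMA-capable targets.
fn refine_recip<S: Simd>(simd: S, d: f32x4<S>, x: f32x4<S>) -> f32x4<S> {
    let two = simd.splat_f32x4(2.0); // assumed splat constructor
    let factor = simd.mul_neg_add_f32x4(d, x, two); // 2 - d * x
    simd.mul_f32x4(factor, x) // assumed element-wise multiply
}
```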