From e89127fec752d38bf9c786a004a7b0dac9e26083 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Fri, 12 Dec 2025 20:24:49 -0500 Subject: [PATCH 1/4] Add "slide" operation --- fearless_simd/src/generated/avx2.rs | 1274 +++++++++++- fearless_simd/src/generated/fallback.rs | 708 +++++++ fearless_simd/src/generated/neon.rs | 1791 +++++++++++++++++ fearless_simd/src/generated/simd_trait.rs | 357 ++++ fearless_simd/src/generated/simd_types.rs | 360 ++++ fearless_simd/src/generated/sse4_2.rs | 1141 +++++++++++ fearless_simd/src/generated/wasm.rs | 1155 +++++++++++ fearless_simd/src/support.rs | 21 + fearless_simd_dev_macros/src/lib.rs | 11 +- fearless_simd_gen/src/generic.rs | 28 +- fearless_simd_gen/src/level.rs | 9 + fearless_simd_gen/src/mk_fallback.rs | 11 + fearless_simd_gen/src/mk_neon.rs | 94 +- fearless_simd_gen/src/mk_simd_trait.rs | 22 + fearless_simd_gen/src/mk_simd_types.rs | 11 + fearless_simd_gen/src/mk_wasm.rs | 106 +- fearless_simd_gen/src/mk_x86.rs | 200 +- fearless_simd_gen/src/ops.rs | 61 +- fearless_simd_tests/tests/harness/mod.rs | 492 +++++ .../tests/harness/slide_exhaustive.rs | 292 +++ 20 files changed, 8127 insertions(+), 17 deletions(-) create mode 100644 fearless_simd_tests/tests/harness/slide_exhaustive.rs diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index bab26fde..6503426d 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -146,6 +146,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } } @@ -409,6 +434,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -600,6 +650,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -797,6 +872,35 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> 
mask8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask8x16(b).val.0, + self.cvt_to_bytes_mask8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -894,6 +998,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -1060,6 +1189,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u16x8(b).val.0, + self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -1232,6 +1386,35 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask16x8(b).val.0, + self.cvt_to_bytes_mask16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, b) + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1329,6 +1512,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -1497,6 +1705,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u32x4(b).val.0, + 
self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -1677,6 +1910,35 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask32x4(b).val.0, + self.cvt_to_bytes_mask32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + self.slide_mask32x4::(a, b) + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1774,6 +2036,31 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } } @@ -1966,6 +2253,35 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask64x2(b).val.0, + self.cvt_to_bytes_mask64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2063,6 +2379,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { unsafe { _mm256_andnot_ps(_mm256_set1_ps(-0.0), a.into()).simd_into(self) } } @@ -2366,6 +2720,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn 
slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -2597,6 +2989,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { unsafe { _mm256_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -2839,6 +3269,48 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask8x32(b).val.0, + self.cvt_to_bytes_mask8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask8x32(b).val.0, + self.cvt_to_bytes_mask8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -2952,6 +3424,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + 
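// Reference model for the `slide` family, mirroring the fallback backend (illustrative
// sketch only; `slide_ref` is a hypothetical helper, not part of the generated code).
// `slide::<SHIFT>(a, b)` shifts the lanes of `a` down by SHIFT positions and fills the
// vacated top lanes from the bottom of `b`, i.e. lane i of the result is
// `concat(a, b)[i + SHIFT]`; SHIFT >= LANES yields `b` unchanged.
//
// fn slide_ref<const SHIFT: usize, const N: usize>(a: [u8; N], b: [u8; N]) -> [u8; N] {
//     if SHIFT >= N {
//         return b;
//     }
//     let mut out = [0u8; N];
//     out[..N - SHIFT].copy_from_slice(&a[SHIFT..]); // high lanes of `a` move down
//     out[N - SHIFT..].copy_from_slice(&b[..SHIFT]); // low lanes of `b` fill the top
//     out
// }
//
// E.g. with SHIFT = 1 and 4 lanes: a = [a0, a1, a2, a3], b = [b0, b1, b2, b3]
// gives [a1, a2, a3, b0].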
#[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -3160,6 +3670,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { unsafe { _mm256_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -3386,6 +3934,48 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask16x16(b).val.0, + self.cvt_to_bytes_mask16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask16x16(b).val.0, + self.cvt_to_bytes_mask16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -3499,6 +4089,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -3699,6 +4327,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + unsafe { + if 
SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { unsafe { _mm256_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -3905,12 +4571,54 @@ impl Simd for Avx2 { } } #[inline(always)] - fn cvt_to_bytes_mask32x8(self, a: mask32x8) -> u8x32 { + fn cvt_to_bytes_mask32x8(self, a: mask32x8) -> u8x32 { + unsafe { + u8x32 { + val: core::mem::transmute(a.val), + simd: self, + } + } + } + #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask32x8(b).val.0, + self.cvt_to_bytes_mask32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { unsafe { - u8x32 { - val: core::mem::transmute(a.val), - simd: self, + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask32x8(b).val.0, + self.cvt_to_bytes_mask32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } } #[inline(always)] @@ -4023,6 +4731,44 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { unsafe { _mm256_andnot_pd(_mm256_set1_pd(-0.0), a.into()).simd_into(self) } } @@ -4255,6 +5001,48 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_256x1( + self.cvt_to_bytes_mask64x4(b).val.0, + self.cvt_to_bytes_mask64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_256( + self.cvt_to_bytes_mask64x4(b).val.0, + self.cvt_to_bytes_mask64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) } } @@ -4375,6 +5163,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f32x16(self, a: 
f32x16, b: f32x16) -> f32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) @@ -4731,6 +5549,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); @@ -4957,6 +5805,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); @@ -5228,6 +6106,40 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_mask8x64(b).val.0, + self.cvt_to_bytes_mask8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32( + self.slide_within_blocks_mask8x32::(a0, b0), + self.slide_within_blocks_mask8x32::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -5364,6 +6276,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } 
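// The two __m256i halves are viewed as four consecutive 128-bit blocks of the
// concatenation [a : b] (a in the low bytes, b in the high bytes). The byte shift is
// split into a whole-block part (shift_bytes / 16) and an intra-block remainder
// (shift_bytes % 16); each output block is then an `alignr` of two adjacent input
// blocks, with `_mm256_permute2x128_si256` forming the pairs that straddle a register
// boundary (see `cross_block_alignr_one` below).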
+ let result = cross_block_alignr_256x2( + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); @@ -5599,6 +6541,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); @@ -5897,6 +6869,40 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_mask16x32(b).val.0, + self.cvt_to_bytes_mask16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.slide_within_blocks_mask16x16::(a0, b0), + self.slide_within_blocks_mask16x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -6036,6 +7042,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); @@ -6267,6 +7303,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } 
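// Note: the full-width slide has to move lanes across the two 256-bit halves, so it is
// done with `cross_block_alignr_256x2` on the byte representation. The within-blocks
// variant that follows never crosses a 128-bit boundary, so it simply splits into
// 256-bit halves and recurses.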
+ let result = cross_block_alignr_256x2( + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); @@ -6530,6 +7596,40 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_mask32x16(b).val.0, + self.cvt_to_bytes_mask32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8( + self.slide_within_blocks_mask32x8::(a0, b0), + self.slide_within_blocks_mask32x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -6666,6 +7766,36 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) @@ -6930,6 +8060,40 @@ impl Simd for Avx2 { } } #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_256x2( + self.cvt_to_bytes_mask64x8(b).val.0, + self.cvt_to_bytes_mask64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4( + self.slide_within_blocks_mask64x4::(a0, b0), + self.slide_within_blocks_mask64x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); @@ -7187,3 +8351,105 @@ impl From> for __m256i { unsafe { core::mem::transmute_copy(&value.val) } } } +#[doc = r" This is a version of 
the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
+    unsafe {
+        match shift {
+            0usize => _mm_alignr_epi8::<0i32>(a, b),
+            1usize => _mm_alignr_epi8::<1i32>(a, b),
+            2usize => _mm_alignr_epi8::<2i32>(a, b),
+            3usize => _mm_alignr_epi8::<3i32>(a, b),
+            4usize => _mm_alignr_epi8::<4i32>(a, b),
+            5usize => _mm_alignr_epi8::<5i32>(a, b),
+            6usize => _mm_alignr_epi8::<6i32>(a, b),
+            7usize => _mm_alignr_epi8::<7i32>(a, b),
+            8usize => _mm_alignr_epi8::<8i32>(a, b),
+            9usize => _mm_alignr_epi8::<9i32>(a, b),
+            10usize => _mm_alignr_epi8::<10i32>(a, b),
+            11usize => _mm_alignr_epi8::<11i32>(a, b),
+            12usize => _mm_alignr_epi8::<12i32>(a, b),
+            13usize => _mm_alignr_epi8::<13i32>(a, b),
+            14usize => _mm_alignr_epi8::<14i32>(a, b),
+            15usize => _mm_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
+#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+#[doc = r" Rust doesn't currently let you do math on const generics."]
+#[inline(always)]
+unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
+    unsafe {
+        match shift {
+            0usize => _mm256_alignr_epi8::<0i32>(a, b),
+            1usize => _mm256_alignr_epi8::<1i32>(a, b),
+            2usize => _mm256_alignr_epi8::<2i32>(a, b),
+            3usize => _mm256_alignr_epi8::<3i32>(a, b),
+            4usize => _mm256_alignr_epi8::<4i32>(a, b),
+            5usize => _mm256_alignr_epi8::<5i32>(a, b),
+            6usize => _mm256_alignr_epi8::<6i32>(a, b),
+            7usize => _mm256_alignr_epi8::<7i32>(a, b),
+            8usize => _mm256_alignr_epi8::<8i32>(a, b),
+            9usize => _mm256_alignr_epi8::<9i32>(a, b),
+            10usize => _mm256_alignr_epi8::<10i32>(a, b),
+            11usize => _mm256_alignr_epi8::<11i32>(a, b),
+            12usize => _mm256_alignr_epi8::<12i32>(a, b),
+            13usize => _mm256_alignr_epi8::<13i32>(a, b),
+            14usize => _mm256_alignr_epi8::<14i32>(a, b),
+            15usize => _mm256_alignr_epi8::<15i32>(a, b),
+            _ => unreachable!(),
+        }
+    }
+}
+#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."]
+#[doc = r""]
+#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"]
+#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."]
+#[inline(always)]
+unsafe fn cross_block_alignr_one(
+    regs: &[__m256i],
+    block_idx: usize,
+    shift_bytes: usize,
+) -> __m256i {
+    let lo_idx = block_idx + (shift_bytes / 16);
+    let intra_shift = shift_bytes % 16;
+    let lo_blocks = if lo_idx % 2 == 0 {
+        regs[lo_idx / 2]
+    } else {
+        unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) }
+    };
+    let hi_idx = lo_idx + 1;
+    let hi_blocks = if hi_idx % 2 == 0 {
+        regs[hi_idx / 2]
+    } else {
+        unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }
+    };
+    unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
+}
+#[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
+#[doc = r" `shift_bytes`.
Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] +#[inline(always)] +unsafe fn cross_block_alignr_256x2( + a: [__m256i; 2], + b: [__m256i; 2], + shift_bytes: usize, +) -> [__m256i; 2] { + let regs = [b[0], b[1], a[0], a[1]]; + unsafe { + [ + cross_block_alignr_one(®s, 0, shift_bytes), + cross_block_alignr_one(®s, 2, shift_bytes), + ] + } +} +#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"] +#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] +#[inline(always)] +unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { + let regs = [b, a]; + unsafe { cross_block_alignr_one(®s, 0, shift_bytes) } +} diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs index 1701a140..92bec131 100644 --- a/fearless_simd/src/generated/fallback.rs +++ b/fearless_simd/src/generated/fallback.rs @@ -199,6 +199,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { [ f32::abs(a[0usize]), @@ -552,6 +567,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { [ i8::wrapping_add(a[0usize], b[0usize]), @@ -1114,6 +1144,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { [ u8::wrapping_add(a[0usize], b[0usize]), @@ -1672,6 +1717,25 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { [ i8::bitand(a[0usize], &b[0usize]), @@ -1964,6 +2028,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - 
SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { [ i16::wrapping_add(a[0usize], b[0usize]), @@ -2335,6 +2414,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { [ u16::wrapping_add(a[0usize], b[0usize]), @@ -2692,6 +2786,25 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, b) + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { [ i16::bitand(a[0usize], &b[0usize]), @@ -2880,6 +2993,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { [ i32::wrapping_add(a[0usize], b[0usize]), @@ -3169,6 +3297,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { [ u32::wrapping_add(a[0usize], b[0usize]), @@ -3444,6 +3587,25 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + self.slide_mask32x4::(a, b) + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { [ i32::bitand(a[0usize], &b[0usize]), @@ -3580,6 +3742,21 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + let mut dest = [Default::default(); 2usize]; + dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + 
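// `slide_within_blocks` applies the slide independently inside each 128-bit block rather
// than across the whole vector. For vectors of 128 bits or fewer the two operations
// coincide, so the fallback simply forwards to `slide`; wider vectors split into halves
// and recurse. Worked example (illustrative) for 8 x f32 lanes with SHIFT = 1:
//   a = [a0, .., a7], b = [b0, .., b7]
//   slide               -> [a1, a2, a3, a4, a5, a6, a7, b0]
//   slide_within_blocks -> [a1, a2, a3, b0, a5, a6, a7, b4]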
#[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { [f64::abs(a[0usize]), f64::abs(a[1usize])].simd_into(self) } @@ -3819,6 +3996,25 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + let mut dest = [Default::default(); 2usize]; + dest[..2usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[2usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { [ i64::bitand(a[0usize], &b[0usize]), @@ -3940,6 +4136,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) @@ -4243,6 +4459,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); @@ -4461,6 +4697,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); @@ -4674,6 +4930,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: 
mask8x32, + b: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16( + self.slide_within_blocks_mask8x16::(a0, b0), + self.slide_within_blocks_mask8x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -4802,6 +5082,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) + } + #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); @@ -5020,6 +5320,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), + ) + } + #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); @@ -5255,6 +5575,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8( + self.slide_within_blocks_mask16x8::(a0, b0), + self.slide_within_blocks_mask16x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -5383,6 +5727,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), + ) + } + #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = 
self.split_i32x8(b); @@ -5606,6 +5970,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), + ) + } + #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); @@ -5816,6 +6200,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4( + self.slide_within_blocks_mask32x4::(a0, b0), + self.slide_within_blocks_mask32x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -5944,6 +6352,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) @@ -6200,6 +6628,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + let mut dest = [Default::default(); 4usize]; + dest[..4usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[4usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2( + self.slide_within_blocks_mask64x2::(a0, b0), + self.slide_within_blocks_mask64x2::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -6328,6 +6780,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn 
slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) @@ -6654,6 +7126,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + let mut dest = [Default::default(); 64usize]; + dest[..64usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[64usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); @@ -6865,6 +7357,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + let mut dest = [Default::default(); 64usize]; + dest[..64usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[64usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); @@ -7151,6 +7663,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let mut dest = [Default::default(); 64usize]; + dest[..64usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[64usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32( + self.slide_within_blocks_mask8x32::(a0, b0), + self.slide_within_blocks_mask8x32::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -7272,6 +7808,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); @@ 
-7492,6 +8048,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); @@ -7760,6 +8336,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let mut dest = [Default::default(); 32usize]; + dest[..32usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[32usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.slide_within_blocks_mask16x16::(a0, b0), + self.slide_within_blocks_mask16x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -7884,6 +8484,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); @@ -8100,6 +8720,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); @@ -8333,6 +8973,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let mut dest = [Default::default(); 16usize]; + dest[..16usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[16usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn 
slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8( + self.slide_within_blocks_mask32x8::(a0, b0), + self.slide_within_blocks_mask32x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -8454,6 +9118,26 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) @@ -8703,6 +9387,30 @@ impl Simd for Fallback { } } #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + let mut dest = [Default::default(); 8usize]; + dest[..8usize - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[8usize - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4( + self.slide_within_blocks_mask64x4::(a0, b0), + self.slide_within_blocks_mask64x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index 8e432c59..bc2c063d 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -138,6 +138,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + if SHIFT >= 4usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_f32x4(a).val.0, + self.cvt_to_bytes_f32x4(b).val.0, + SHIFT * 4usize, + ) + }; + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { unsafe { vabsq_f32(a.into()).simd_into(self) } } @@ -356,6 +381,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_i8x16(a).val.0, + self.cvt_to_bytes_i8x16(b).val.0, + SHIFT, + ) + }; + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { vaddq_s8(a.into(), b.into()).simd_into(self) } } @@ 
-523,6 +573,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_u8x16(a).val.0, + self.cvt_to_bytes_u8x16(b).val.0, + SHIFT, + ) + }; + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { vaddq_u8(a.into(), b.into()).simd_into(self) } } @@ -690,6 +765,35 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_mask8x16(a).val.0, + self.cvt_to_bytes_mask8x16(b).val.0, + SHIFT, + ) + }; + self.cvt_from_bytes_mask8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { vandq_s8(a.into(), b.into()).simd_into(self) } } @@ -790,6 +894,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_i16x8(a).val.0, + self.cvt_to_bytes_i16x8(b).val.0, + SHIFT * 2usize, + ) + }; + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { vaddq_s16(a.into(), b.into()).simd_into(self) } } @@ -957,6 +1086,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_u16x8(a).val.0, + self.cvt_to_bytes_u16x8(b).val.0, + SHIFT * 2usize, + ) + }; + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { vaddq_u16(a.into(), b.into()).simd_into(self) } } @@ -1120,6 +1274,35 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_mask16x8(a).val.0, + self.cvt_to_bytes_mask16x8(b).val.0, + SHIFT * 2usize, + ) + }; + self.cvt_from_bytes_mask16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, b) + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { vandq_s16(a.into(), b.into()).simd_into(self) } } @@ -1220,6 +1403,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + if SHIFT >= 4usize { + return b; + } + let 
result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_i32x4(a).val.0, + self.cvt_to_bytes_i32x4(b).val.0, + SHIFT * 4usize, + ) + }; + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { vaddq_s32(a.into(), b.into()).simd_into(self) } } @@ -1391,6 +1599,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + if SHIFT >= 4usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_u32x4(a).val.0, + self.cvt_to_bytes_u32x4(b).val.0, + SHIFT * 4usize, + ) + }; + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { vaddq_u32(a.into(), b.into()).simd_into(self) } } @@ -1554,6 +1787,35 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + if SHIFT >= 4usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_mask32x4(a).val.0, + self.cvt_to_bytes_mask32x4(b).val.0, + SHIFT * 4usize, + ) + }; + self.cvt_from_bytes_mask32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + self.slide_mask32x4::(a, b) + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { vandq_s32(a.into(), b.into()).simd_into(self) } } @@ -1654,6 +1916,31 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + if SHIFT >= 2usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_f64x2(a).val.0, + self.cvt_to_bytes_f64x2(b).val.0, + SHIFT * 8usize, + ) + }; + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { vabsq_f64(a.into()).simd_into(self) } } @@ -1844,6 +2131,35 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + if SHIFT >= 2usize { + return b; + } + let result = unsafe { + dyn_vext_128( + self.cvt_to_bytes_mask64x2(a).val.0, + self.cvt_to_bytes_mask64x2(b).val.0, + SHIFT * 8usize, + ) + }; + self.cvt_from_bytes_mask64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { vandq_s64(a.into(), b.into()).simd_into(self) } } @@ -1945,6 +2261,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_f32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_f32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + 
let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) @@ -2255,6 +2621,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i8x32(a).val.0; + let b_bytes = self.cvt_to_bytes_i8x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); @@ -2480,6 +2896,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u8x32(a).val.0; + let b_bytes = self.cvt_to_bytes_u8x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); @@ -2700,6 
+3166,60 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask8x32(a).val.0; + let b_bytes = self.cvt_to_bytes_mask8x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16( + self.slide_within_blocks_mask8x16::(a0, b0), + self.slide_within_blocks_mask8x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -2835,6 +3355,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i16x16(a).val.0; + let b_bytes = self.cvt_to_bytes_i16x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 2usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) + } + #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); @@ -3060,6 +3630,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u16x16(a).val.0; + let b_bytes = self.cvt_to_bytes_u16x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 2usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn 
slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), + ) + } + #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); @@ -3289,6 +3909,60 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask16x16(a).val.0; + let b_bytes = self.cvt_to_bytes_mask16x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 2usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8( + self.slide_within_blocks_mask16x8::(a0, b0), + self.slide_within_blocks_mask16x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -3424,6 +4098,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_i32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), + ) + } + #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); @@ -3654,6 +4378,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_u32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = 
crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), + ) + } + #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); @@ -3871,6 +4645,60 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask32x8(a).val.0; + let b_bytes = self.cvt_to_bytes_mask32x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 4usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4( + self.slide_within_blocks_mask32x4::(a0, b0), + self.slide_within_blocks_mask32x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -4006,6 +4834,56 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_f64x4(a).val.0; + let b_bytes = self.cvt_to_bytes_f64x4(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 8usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) @@ -4269,6 +5147,60 @@ impl Simd 
for Neon { } } #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + if SHIFT >= 4usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask64x4(a).val.0; + let b_bytes = self.cvt_to_bytes_mask64x4(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1]; + let b_blocks = [b_bytes.0, b_bytes.1]; + let shift_bytes = SHIFT * 8usize; + uint8x16x2_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2( + self.slide_within_blocks_mask64x2::(a0, b0), + self.slide_within_blocks_mask64x2::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -4404,6 +5336,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_f32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_f32x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 4usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) @@ -4713,6 +5713,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i8x64(a).val.0; + let b_bytes = self.cvt_to_bytes_i8x64(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, 
+ &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); @@ -4929,6 +5997,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u8x64(a).val.0; + let b_bytes = self.cvt_to_bytes_u8x64(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); @@ -5143,6 +6279,78 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + if SHIFT >= 64usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask8x64(a).val.0; + let b_bytes = self.cvt_to_bytes_mask8x64(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, 
shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32( + self.slide_within_blocks_mask8x32::(a0, b0), + self.slide_within_blocks_mask8x32::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -5269,6 +6477,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i16x32(a).val.0; + let b_bytes = self.cvt_to_bytes_i16x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 2usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); @@ -5494,6 +6770,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u16x32(a).val.0; + let b_bytes = self.cvt_to_bytes_u16x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 2usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, 
hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); @@ -5727,6 +7071,78 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + if SHIFT >= 32usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask16x32(a).val.0; + let b_bytes = self.cvt_to_bytes_mask16x32(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 2usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.slide_within_blocks_mask16x16::(a0, b0), + self.slide_within_blocks_mask16x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -5856,6 +7272,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_i32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_i32x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 4usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + 
&b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); @@ -6077,6 +7561,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_u32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_u32x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 4usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); @@ -6293,6 +7845,78 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + if SHIFT >= 16usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask32x16(a).val.0; + let b_bytes = self.cvt_to_bytes_mask32x16(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 4usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask32x16(u8x64 { + val: 
crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8( + self.slide_within_blocks_mask32x8::(a0, b0), + self.slide_within_blocks_mask32x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -6419,6 +8043,74 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_f64x8(a).val.0; + let b_bytes = self.cvt_to_bytes_f64x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 8usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) @@ -6673,6 +8365,78 @@ impl Simd for Neon { } } #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + if SHIFT >= 8usize { + return b; + } + let result = unsafe { + let a_bytes = self.cvt_to_bytes_mask64x8(a).val.0; + let b_bytes = self.cvt_to_bytes_mask64x8(b).val.0; + let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; + let b_blocks = [b_bytes.0, b_bytes.1, b_bytes.2, b_bytes.3]; + let shift_bytes = SHIFT * 8usize; + uint8x16x4_t( + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 0, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 1, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 2, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at( + &a_blocks, + &b_blocks, + 3, + shift_bytes, + ); + dyn_vext_128(lo, hi, shift_bytes % 16) + }, + ) + }; + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: 
mask64x8, + b: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4( + self.slide_within_blocks_mask64x4::(a0, b0), + self.slide_within_blocks_mask64x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); @@ -7290,3 +9054,30 @@ impl From> for int64x2x4_t { unsafe { core::mem::transmute_copy(&value.val) } } } +#[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { + unsafe { + match shift { + 0usize => vextq_u8::<0i32>(a, b), + 1usize => vextq_u8::<1i32>(a, b), + 2usize => vextq_u8::<2i32>(a, b), + 3usize => vextq_u8::<3i32>(a, b), + 4usize => vextq_u8::<4i32>(a, b), + 5usize => vextq_u8::<5i32>(a, b), + 6usize => vextq_u8::<6i32>(a, b), + 7usize => vextq_u8::<7i32>(a, b), + 8usize => vextq_u8::<8i32>(a, b), + 9usize => vextq_u8::<9i32>(a, b), + 10usize => vextq_u8::<10i32>(a, b), + 11usize => vextq_u8::<11i32>(a, b), + 12usize => vextq_u8::<12i32>(a, b), + 13usize => vextq_u8::<13i32>(a, b), + 14usize => vextq_u8::<14i32>(a, b), + 15usize => vextq_u8::<15i32>(a, b), + _ => unreachable!(), + } + } +} diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index cba1593b..bad2db0e 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -136,6 +136,14 @@ pub trait Simd: fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x4(self, a: f32x4) -> u8x16; + #[doc = ""] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4; + #[doc = ""] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4; #[doc = "Compute the absolute value of each element."] fn abs_f32x4(self, a: f32x4) -> f32x4; #[doc = "Negate each element of the vector."] @@ -228,6 +236,14 @@ pub trait Simd: fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x16(self, a: i8x16) -> u8x16; + #[doc = ""] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16; + #[doc = ""] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -298,6 +314,14 @@ pub trait Simd: fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x16(self, a: u8x16) -> u8x16; + #[doc = ""] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16; + #[doc = ""] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -366,6 +390,18 @@ pub trait Simd: fn cvt_from_bytes_mask8x16(self, a: u8x16) -> mask8x16; 
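// Illustrative only (not part of the generated diff): a minimal scalar model of the
// `slide` semantics that the fallback implementations above express with
// `copy_from_slice`. `slide::<SHIFT>(a, b)` drops the low `SHIFT` lanes of `a` and
// fills the vacated tail with the low `SHIFT` lanes of `b`; for `SHIFT >= N` the
// result is simply `b`. `slide_within_blocks` applies the same operation to each
// 128-bit block independently, which is why the wider implementations recurse down
// to the 128-bit case, where the two operations coincide. `slide_scalar_ref` is a
// hypothetical helper used only for this sketch and is not part of the crate's API.
fn slide_scalar_ref<const SHIFT: usize, const N: usize>(a: [i32; N], b: [i32; N]) -> [i32; N] {
    if SHIFT >= N {
        return b;
    }
    let mut dest = [0; N];
    // Matches the fallback codegen: high part of `a`, then low part of `b`.
    dest[..N - SHIFT].copy_from_slice(&a[SHIFT..]);
    dest[N - SHIFT..].copy_from_slice(&b[..SHIFT]);
    dest
}

#[cfg(test)]
mod slide_semantics_sketch {
    use super::slide_scalar_ref;

    #[test]
    fn slide_by_one() {
        let a = [0, 1, 2, 3];
        let b = [4, 5, 6, 7];
        // Lanes 1..4 of `a`, then lane 0 of `b` -- the same result the NEON path
        // obtains via `vextq_u8` and the x86 path via `_mm_alignr_epi8`.
        assert_eq!(slide_scalar_ref::<1, 4>(a, b), [1, 2, 3, 4]);
    }
}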
#[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x16(self, a: mask8x16) -> u8x16; + #[doc = ""] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16; + #[doc = ""] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16; #[doc = "Compute the logical AND of two masks."] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16; #[doc = "Compute the logical OR of two masks."] @@ -409,6 +445,14 @@ pub trait Simd: fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x8(self, a: i16x8) -> u8x16; + #[doc = ""] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8; + #[doc = ""] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -479,6 +523,14 @@ pub trait Simd: fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x8(self, a: u16x8) -> u8x16; + #[doc = ""] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8; + #[doc = ""] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -547,6 +599,18 @@ pub trait Simd: fn cvt_from_bytes_mask16x8(self, a: u8x16) -> mask16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x8(self, a: mask16x8) -> u8x16; + #[doc = ""] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8; + #[doc = ""] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8; #[doc = "Compute the logical AND of two masks."] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8; #[doc = "Compute the logical OR of two masks."] @@ -590,6 +654,14 @@ pub trait Simd: fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x4(self, a: i32x4) -> u8x16; + #[doc = ""] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4; + #[doc = ""] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -662,6 +734,14 @@ pub trait Simd: fn cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x4(self, a: u32x4) -> u8x16; + #[doc = ""] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4; + #[doc = ""] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -730,6 +810,18 @@ pub trait Simd: fn cvt_from_bytes_mask32x4(self, a: u8x16) -> mask32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn 
cvt_to_bytes_mask32x4(self, a: mask32x4) -> u8x16; + #[doc = ""] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4; + #[doc = ""] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4; #[doc = "Compute the logical AND of two masks."] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4; #[doc = "Compute the logical OR of two masks."] @@ -773,6 +865,14 @@ pub trait Simd: fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x2(self, a: f64x2) -> u8x16; + #[doc = ""] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2; + #[doc = ""] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2; #[doc = "Compute the absolute value of each element."] fn abs_f64x2(self, a: f64x2) -> f64x2; #[doc = "Negate each element of the vector."] @@ -851,6 +951,18 @@ pub trait Simd: fn cvt_from_bytes_mask64x2(self, a: u8x16) -> mask64x2; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x2(self, a: mask64x2) -> u8x16; + #[doc = ""] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2; + #[doc = ""] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2; #[doc = "Compute the logical AND of two masks."] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2; #[doc = "Compute the logical OR of two masks."] @@ -894,6 +1006,14 @@ pub trait Simd: fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32; + #[doc = ""] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8; + #[doc = ""] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8; #[doc = "Compute the absolute value of each element."] fn abs_f32x8(self, a: f32x8) -> f32x8; #[doc = "Negate each element of the vector."] @@ -988,6 +1108,14 @@ pub trait Simd: fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32; + #[doc = ""] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32; + #[doc = ""] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1060,6 +1188,14 @@ pub trait Simd: fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32; + #[doc = ""] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32; + #[doc = ""] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1130,6 +1266,18 @@ pub trait Simd: fn cvt_from_bytes_mask8x32(self, a: u8x32) -> mask8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x32(self, a: mask8x32) -> u8x32; + #[doc = ""] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32; + #[doc = ""] + fn 
slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32; #[doc = "Compute the logical AND of two masks."] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32; #[doc = "Compute the logical OR of two masks."] @@ -1175,6 +1323,14 @@ pub trait Simd: fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32; + #[doc = ""] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16; + #[doc = ""] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1247,6 +1403,14 @@ pub trait Simd: fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32; + #[doc = ""] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16; + #[doc = ""] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1319,6 +1483,18 @@ pub trait Simd: fn cvt_from_bytes_mask16x16(self, a: u8x32) -> mask16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x16(self, a: mask16x16) -> u8x32; + #[doc = ""] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16; + #[doc = ""] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16; #[doc = "Compute the logical AND of two masks."] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16; #[doc = "Compute the logical OR of two masks."] @@ -1364,6 +1540,14 @@ pub trait Simd: fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32; + #[doc = ""] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8; + #[doc = ""] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1438,6 +1622,14 @@ pub trait Simd: fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32; + #[doc = ""] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8; + #[doc = ""] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1508,6 +1700,18 @@ pub trait Simd: fn cvt_from_bytes_mask32x8(self, a: u8x32) -> mask32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask32x8(self, a: mask32x8) -> u8x32; + #[doc = ""] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8; + #[doc = ""] + fn slide_within_blocks_mask32x8( + self, + a: 
mask32x8, + b: mask32x8, + ) -> mask32x8; #[doc = "Compute the logical AND of two masks."] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8; #[doc = "Compute the logical OR of two masks."] @@ -1553,6 +1757,14 @@ pub trait Simd: fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32; + #[doc = ""] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4; + #[doc = ""] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4; #[doc = "Compute the absolute value of each element."] fn abs_f64x4(self, a: f64x4) -> f64x4; #[doc = "Negate each element of the vector."] @@ -1633,6 +1845,18 @@ pub trait Simd: fn cvt_from_bytes_mask64x4(self, a: u8x32) -> mask64x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x4(self, a: mask64x4) -> u8x32; + #[doc = ""] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4; + #[doc = ""] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4; #[doc = "Compute the logical AND of two masks."] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4; #[doc = "Compute the logical OR of two masks."] @@ -1678,6 +1902,14 @@ pub trait Simd: fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64; + #[doc = ""] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16; + #[doc = ""] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16; #[doc = "Compute the absolute value of each element."] fn abs_f32x16(self, a: f32x16) -> f32x16; #[doc = "Negate each element of the vector."] @@ -1774,6 +2006,14 @@ pub trait Simd: fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64; + #[doc = ""] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64; + #[doc = ""] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1844,6 +2084,14 @@ pub trait Simd: fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64; + #[doc = ""] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64; + #[doc = ""] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -1914,6 +2162,18 @@ pub trait Simd: fn cvt_from_bytes_mask8x64(self, a: u8x64) -> mask8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x64(self, a: mask8x64) -> u8x64; + #[doc = ""] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64; + #[doc = ""] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64; #[doc = "Compute the logical AND of two masks."] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64; #[doc = 
"Compute the logical OR of two masks."] @@ -1957,6 +2217,14 @@ pub trait Simd: fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64; + #[doc = ""] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32; + #[doc = ""] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -2027,6 +2295,14 @@ pub trait Simd: fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64; + #[doc = ""] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32; + #[doc = ""] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -2101,6 +2377,18 @@ pub trait Simd: fn cvt_from_bytes_mask16x32(self, a: u8x64) -> mask16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x32(self, a: mask16x32) -> u8x64; + #[doc = ""] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32; + #[doc = ""] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32; #[doc = "Compute the logical AND of two masks."] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32; #[doc = "Compute the logical OR of two masks."] @@ -2144,6 +2432,14 @@ pub trait Simd: fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64; + #[doc = ""] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16; + #[doc = ""] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -2216,6 +2512,14 @@ pub trait Simd: fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64; + #[doc = ""] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16; + #[doc = ""] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16; #[doc = "Add two vectors element-wise, wrapping on overflow."] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16; #[doc = "Subtract two vectors element-wise, wrapping on overflow."] @@ -2288,6 +2592,18 @@ pub trait Simd: fn cvt_from_bytes_mask32x16(self, a: u8x64) -> mask32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask32x16(self, a: mask32x16) -> u8x64; + #[doc = ""] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16; + #[doc = ""] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16; #[doc = "Compute the logical AND of two masks."] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16; #[doc = 
"Compute the logical OR of two masks."] @@ -2331,6 +2647,14 @@ pub trait Simd: fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64; + #[doc = ""] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8; + #[doc = ""] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8; #[doc = "Compute the absolute value of each element."] fn abs_f64x8(self, a: f64x8) -> f64x8; #[doc = "Negate each element of the vector."] @@ -2409,6 +2733,18 @@ pub trait Simd: fn cvt_from_bytes_mask64x8(self, a: u8x64) -> mask64x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x8(self, a: mask64x8) -> u8x64; + #[doc = ""] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8; + #[doc = ""] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8; #[doc = "Compute the logical AND of two masks."] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8; #[doc = "Compute the logical OR of two masks."] @@ -2532,6 +2868,27 @@ pub trait SimdBase: #[doc = r" calling `f` with that element's lane index (from 0 to"] #[doc = r" [`SimdBase::N`] - 1)."] fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self; + #[doc = r" Concatenate `[self, rhs]` and extract `Self::N` elements"] + #[doc = r" starting at index `SHIFT`."] + #[doc = r""] + #[doc = r" `SHIFT` must be within [0, `Self::N`]."] + #[doc = r""] + #[doc = r#" This can be used to implement a "shift items" operation by"#] + #[doc = r" providing all zeroes as one operand. For a left shift, the"] + #[doc = r" right-hand side should be all zeroes. For a right shift by `M`"] + #[doc = r" items, the left-hand side should be all zeroes, and the shift"] + #[doc = r" amount will be `Self::N - M`."] + #[doc = r""] + #[doc = r" This can also be used to rotate items within a vector by"] + #[doc = r" providing the same vector as both operands."] + #[doc = r""] + #[doc = r" ```text"] + #[doc = r" slide::<1>([a b c d], [e f g h]) == [b c d e]"] + #[doc = r" ```"] + fn slide(self, rhs: impl SimdInto) -> Self; + #[doc = r" Like [`slide`](SimdBase::slide), but operates independently on"] + #[doc = r" each 128-bit block."] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self; } #[doc = r" Functionality implemented by floating-point SIMD vectors."] pub trait SimdFloat: diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs index e9f37172..6d63ac8b 100644 --- a/fearless_simd/src/generated/simd_types.rs +++ b/fearless_simd/src/generated/simd_types.rs @@ -111,6 +111,16 @@ impl SimdBase for f32x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { simd.load_array_f32x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f32x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f32x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f32x4 { #[inline(always)] @@ -336,6 +346,16 @@ impl SimdBase for i8x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_i8x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i8x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn 
slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i8x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i8x16 { #[inline(always)] @@ -497,6 +517,16 @@ impl SimdBase for u8x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self { simd.load_array_u8x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u8x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u8x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u8x16 { #[inline(always)] @@ -663,6 +693,16 @@ impl SimdBase for mask8x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_mask8x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask8x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask8x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask8x16 { #[inline(always)] @@ -800,6 +840,16 @@ impl SimdBase for i16x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_i16x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i16x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i16x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i16x8 { #[inline(always)] @@ -961,6 +1011,16 @@ impl SimdBase for u16x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { simd.load_array_u16x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u16x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u16x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u16x8 { #[inline(always)] @@ -1127,6 +1187,16 @@ impl SimdBase for mask16x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_mask16x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask16x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask16x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask16x8 { #[inline(always)] @@ -1264,6 +1334,16 @@ impl SimdBase for i32x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_i32x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i32x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i32x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i32x4 { #[inline(always)] @@ -1437,6 +1517,16 @@ impl SimdBase for u32x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { simd.load_array_u32x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u32x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn 
slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u32x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u32x4 { #[inline(always)] @@ -1615,6 +1705,16 @@ impl SimdBase for mask32x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_mask32x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask32x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask32x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask32x4 { #[inline(always)] @@ -1752,6 +1852,16 @@ impl SimdBase for f64x2 { fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { simd.load_array_f64x2(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f64x2::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f64x2::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f64x2 { #[inline(always)] @@ -1968,6 +2078,16 @@ impl SimdBase for mask64x2 { fn from_fn(simd: S, f: impl FnMut(usize) -> i64) -> Self { simd.load_array_mask64x2(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask64x2::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask64x2::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask64x2 { #[inline(always)] @@ -2105,6 +2225,16 @@ impl SimdBase for f32x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { simd.load_array_f32x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f32x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f32x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f32x8 { #[inline(always)] @@ -2337,6 +2467,16 @@ impl SimdBase for i8x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_i8x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i8x32::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i8x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i8x32 { #[inline(always)] @@ -2505,6 +2645,16 @@ impl SimdBase for u8x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self { simd.load_array_u8x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u8x32::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u8x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u8x32 { #[inline(always)] @@ -2678,6 +2828,16 @@ impl SimdBase for mask8x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_mask8x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask8x32::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn 
slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask8x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask8x32 { #[inline(always)] @@ -2827,6 +2987,16 @@ impl SimdBase for i16x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_i16x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i16x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i16x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i16x16 { #[inline(always)] @@ -3000,6 +3170,16 @@ impl SimdBase for u16x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { simd.load_array_u16x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u16x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u16x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u16x16 { #[inline(always)] @@ -3173,6 +3353,16 @@ impl SimdBase for mask16x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_mask16x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask16x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask16x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask16x16 { #[inline(always)] @@ -3317,6 +3507,16 @@ impl SimdBase for i32x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_i32x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i32x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i32x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i32x8 { #[inline(always)] @@ -3497,6 +3697,16 @@ impl SimdBase for u32x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { simd.load_array_u32x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u32x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u32x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u32x8 { #[inline(always)] @@ -3682,6 +3892,16 @@ impl SimdBase for mask32x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_mask32x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask32x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask32x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask32x8 { #[inline(always)] @@ -3826,6 +4046,16 @@ impl SimdBase for f64x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { simd.load_array_f64x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f64x4::(self, rhs.simd_into(self.simd)) + } + 
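For reference while reading the per-type `slide` forwarding above and the backend lowerings below: the element-level behaviour documented on `SimdBase::slide` can be modelled on plain arrays. The sketch below is illustrative only and not part of this patch (`slide_ref` does not exist in the crate); it is here so the `slide::<1>([a b c d], [e f g h]) == [b c d e]` doc example can be checked by eye.

/// Reference model of `slide::<SHIFT>(a, b)` (illustrative, not in-tree):
/// concatenate `[a, b]` and take `N` consecutive elements starting at `shift`.
fn slide_ref<T: Copy + Default, const N: usize>(a: [T; N], b: [T; N], shift: usize) -> [T; N] {
    assert!(shift <= N, "SHIFT must be within [0, N]");
    let mut out = [T::default(); N];
    for i in 0..N {
        let src = i + shift;
        out[i] = if src < N { a[src] } else { b[src - N] };
    }
    out
}

#[test]
fn slide_ref_matches_doc_example() {
    // slide::<1>([a b c d], [e f g h]) == [b c d e]
    assert_eq!(slide_ref([1, 2, 3, 4], [5, 6, 7, 8], 1), [2, 3, 4, 5]);
    // SHIFT == N yields the right-hand operand, matching the backends' early return.
    assert_eq!(slide_ref([1, 2, 3, 4], [5, 6, 7, 8], 4), [5, 6, 7, 8]);
}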
#[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f64x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f64x4 { #[inline(always)] @@ -4049,6 +4279,16 @@ impl SimdBase for mask64x4 { fn from_fn(simd: S, f: impl FnMut(usize) -> i64) -> Self { simd.load_array_mask64x4(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask64x4::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask64x4::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask64x4 { #[inline(always)] @@ -4199,6 +4439,16 @@ impl SimdBase for f32x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> f32) -> Self { simd.load_array_f32x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f32x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f32x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f32x16 { #[inline(always)] @@ -4425,6 +4675,16 @@ impl SimdBase for i8x64 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_i8x64(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i8x64::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i8x64::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i8x64 { #[inline(always)] @@ -4587,6 +4847,16 @@ impl SimdBase for u8x64 { fn from_fn(simd: S, f: impl FnMut(usize) -> u8) -> Self { simd.load_array_u8x64(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u8x64::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u8x64::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u8x64 { #[inline(always)] @@ -4754,6 +5024,16 @@ impl SimdBase for mask8x64 { fn from_fn(simd: S, f: impl FnMut(usize) -> i8) -> Self { simd.load_array_mask8x64(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask8x64::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask8x64::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask8x64 { #[inline(always)] @@ -4897,6 +5177,16 @@ impl SimdBase for i16x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_i16x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i16x32::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i16x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i16x32 { #[inline(always)] @@ -5064,6 +5354,16 @@ impl SimdBase for u16x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> u16) -> Self { simd.load_array_u16x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u16x32::(self, rhs.simd_into(self.simd)) + } + 
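At the call site, the "shift items" and rotate patterns described in the `SimdBase::slide` documentation look roughly like the following. This is a sketch against the API as added by this patch (the const `SHIFT` parameter and the `<S>` generics are elided by the diff rendering above); the helper names, the crate-root imports, and the assumption that vectors are `Copy` and convert to themselves via `SimdInto` are illustrative, not guaranteed.

use fearless_simd::{Simd, SimdBase, u8x16};

/// Shift lanes left by one element, filling the vacated lane with zero:
/// [x0 x1 .. x15] -> [x1 .. x15 0].
fn shift_left_one<S: Simd>(simd: S, v: u8x16<S>) -> u8x16<S> {
    let zeroes = u8x16::from_fn(simd, |_| 0u8);
    v.slide::<1>(zeroes)
}

/// Rotate lanes left by one element by sliding the vector against itself:
/// [x0 x1 .. x15] -> [x1 .. x15 x0].
fn rotate_left_one<S: Simd>(v: u8x16<S>) -> u8x16<S> {
    v.slide::<1>(v)
}

A right shift by `M` lanes is the mirror image: pass the zero vector as the left operand and use `SHIFT = N - M`, as the trait docs note.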
#[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u16x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u16x32 { #[inline(always)] @@ -5231,6 +5531,16 @@ impl SimdBase for mask16x32 { fn from_fn(simd: S, f: impl FnMut(usize) -> i16) -> Self { simd.load_array_mask16x32(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask16x32::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask16x32::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask16x32 { #[inline(always)] @@ -5374,6 +5684,16 @@ impl SimdBase for i32x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_i32x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_i32x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_i32x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for i32x16 { #[inline(always)] @@ -5553,6 +5873,16 @@ impl SimdBase for u32x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> u32) -> Self { simd.load_array_u32x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_u32x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_u32x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdInt for u32x16 { #[inline(always)] @@ -5732,6 +6062,16 @@ impl SimdBase for mask32x16 { fn from_fn(simd: S, f: impl FnMut(usize) -> i32) -> Self { simd.load_array_mask32x16(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask32x16::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask32x16::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask32x16 { #[inline(always)] @@ -5870,6 +6210,16 @@ impl SimdBase for f64x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> f64) -> Self { simd.load_array_f64x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_f64x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_f64x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdFloat for f64x8 { #[inline(always)] @@ -6087,6 +6437,16 @@ impl SimdBase for mask64x8 { fn from_fn(simd: S, f: impl FnMut(usize) -> i64) -> Self { simd.load_array_mask64x8(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_mask64x8::(self, rhs.simd_into(self.simd)) + } + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd + .slide_within_blocks_mask64x8::(self, rhs.simd_into(self.simd)) + } } impl crate::SimdMask for mask64x8 { #[inline(always)] diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 58843ad5..88ac20b8 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -151,6 +151,31 @@ impl Simd for Sse4_2 { } } 
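The SSE4.2 implementations that follow lower a 128-bit `slide` to `_mm_alignr_epi8` (via `dyn_alignr_128`, defined at the end of this file) with a byte shift of `SHIFT` times the element size; note the operand order at the call sites: the second `slide` operand ends up in `alignr`'s high bytes and the first in the low bytes. Wider vectors use the `cross_block_alignr_128x2`/`_128x4` helpers for the full-width `slide`, and fall back to split/combine for `slide_within_blocks`, which slides each 128-bit block independently. A block-wise reference model of that operation (illustrative only; `slide_within_blocks_ref` is not part of the crate):

/// Reference model of `slide_within_blocks::<SHIFT>(a, b)` for a vector of `N`
/// elements made of 128-bit blocks of `bs` elements each (illustrative only).
fn slide_within_blocks_ref<T: Copy + Default, const N: usize>(
    a: [T; N],
    b: [T; N],
    shift: usize,
    bs: usize, // elements per 128-bit block, e.g. 4 for f32
) -> [T; N] {
    assert!(N % bs == 0 && shift <= bs);
    let mut out = [T::default(); N];
    for block in 0..N / bs {
        let base = block * bs;
        for i in 0..bs {
            let src = i + shift;
            out[base + i] = if src < bs { a[base + src] } else { b[base + src - bs] };
        }
    }
    out
}

// For a hypothetical f32x8 (two 128-bit blocks of four lanes):
// slide_within_blocks::<1>([a0 a1 a2 a3 | a4 a5 a6 a7], [b0 .. b7])
//   == [a1 a2 a3 b0 | a5 a6 a7 b4]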
#[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { unsafe { _mm_andnot_ps(_mm_set1_ps(-0.0), a.into()).simd_into(self) } } @@ -417,6 +442,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -611,6 +661,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { unsafe { _mm_add_epi8(a.into(), b.into()).simd_into(self) } } @@ -816,6 +891,35 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask8x16(b).val.0, + self.cvt_to_bytes_mask8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -916,6 +1020,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -1085,6 +1214,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u16x8(b).val.0, + 
self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { unsafe { _mm_add_epi16(a.into(), b.into()).simd_into(self) } } @@ -1260,6 +1414,35 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask16x8(b).val.0, + self.cvt_to_bytes_mask16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, b) + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1360,6 +1543,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -1531,6 +1739,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_u32x4(b).val.0, + self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { unsafe { _mm_add_epi32(a.into(), b.into()).simd_into(self) } } @@ -1714,6 +1947,35 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask32x4(b).val.0, + self.cvt_to_bytes_mask32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + self.slide_mask32x4::(a, b) + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -1814,6 +2076,31 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: 
crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { unsafe { _mm_andnot_pd(_mm_set1_pd(-0.0), a.into()).simd_into(self) } } @@ -2009,6 +2296,35 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + unsafe { + if SHIFT >= 2usize { + return b; + } + let result = dyn_alignr_128( + self.cvt_to_bytes_mask64x2(b).val.0, + self.cvt_to_bytes_mask64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) } } @@ -2120,6 +2436,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) @@ -2438,6 +2784,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); @@ -2671,6 +3047,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = 
self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); @@ -2899,6 +3305,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_mask8x32(b).val.0, + self.cvt_to_bytes_mask8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16( + self.slide_within_blocks_mask8x16::(a0, b0), + self.slide_within_blocks_mask8x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -3042,6 +3482,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) + } + #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); @@ -3275,6 +3745,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), + ) + } + #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); @@ -3514,6 +4014,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_mask16x16(b).val.0, + self.cvt_to_bytes_mask16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8( + self.slide_within_blocks_mask16x8::(a0, b0), + self.slide_within_blocks_mask16x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: 
mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -3657,6 +4191,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), + ) + } + #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); @@ -3895,6 +4459,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), + ) + } + #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); @@ -4120,6 +4714,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_mask32x8(b).val.0, + self.cvt_to_bytes_mask32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4( + self.slide_within_blocks_mask32x4::(a0, b0), + self.slide_within_blocks_mask32x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -4263,6 +4891,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); 
self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) @@ -4534,6 +5192,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + unsafe { + if SHIFT >= 4usize { + return b; + } + let result = cross_block_alignr_128x2( + self.cvt_to_bytes_mask64x4(b).val.0, + self.cvt_to_bytes_mask64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2( + self.slide_within_blocks_mask64x2::(a0, b0), + self.slide_within_blocks_mask64x2::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -4681,6 +5373,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) @@ -5041,6 +5763,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); @@ -5271,6 +6023,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); @@ 
-5546,6 +6328,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + unsafe { + if SHIFT >= 64usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_mask8x64(b).val.0, + self.cvt_to_bytes_mask8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32( + self.slide_within_blocks_mask8x32::(a0, b0), + self.slide_within_blocks_mask8x32::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -5686,6 +6502,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); @@ -5925,6 +6771,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); @@ -6219,6 +7095,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + unsafe { + if SHIFT >= 32usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_mask16x32(b).val.0, + self.cvt_to_bytes_mask16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.slide_within_blocks_mask16x16::(a0, b0), + self.slide_within_blocks_mask16x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = 
self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -6362,6 +7272,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); @@ -6597,6 +7537,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); @@ -6864,6 +7834,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + unsafe { + if SHIFT >= 16usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_mask32x16(b).val.0, + self.cvt_to_bytes_mask32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8( + self.slide_within_blocks_mask32x8::(a0, b0), + self.slide_within_blocks_mask32x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -7004,6 +8008,36 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); 
self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) @@ -7272,6 +8306,40 @@ impl Simd for Sse4_2 { } } #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + unsafe { + if SHIFT >= 8usize { + return b; + } + let result = cross_block_alignr_128x4( + self.cvt_to_bytes_mask64x8(b).val.0, + self.cvt_to_bytes_mask64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4( + self.slide_within_blocks_mask64x4::(a0, b0), + self.slide_within_blocks_mask64x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); @@ -7529,3 +8597,76 @@ impl From> for __m128i { unsafe { core::mem::transmute_copy(&value.val) } } } +#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] +#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] +#[doc = r" Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { + unsafe { + match shift { + 0usize => _mm_alignr_epi8::<0i32>(a, b), + 1usize => _mm_alignr_epi8::<1i32>(a, b), + 2usize => _mm_alignr_epi8::<2i32>(a, b), + 3usize => _mm_alignr_epi8::<3i32>(a, b), + 4usize => _mm_alignr_epi8::<4i32>(a, b), + 5usize => _mm_alignr_epi8::<5i32>(a, b), + 6usize => _mm_alignr_epi8::<6i32>(a, b), + 7usize => _mm_alignr_epi8::<7i32>(a, b), + 8usize => _mm_alignr_epi8::<8i32>(a, b), + 9usize => _mm_alignr_epi8::<9i32>(a, b), + 10usize => _mm_alignr_epi8::<10i32>(a, b), + 11usize => _mm_alignr_epi8::<11i32>(a, b), + 12usize => _mm_alignr_epi8::<12i32>(a, b), + 13usize => _mm_alignr_epi8::<13i32>(a, b), + 14usize => _mm_alignr_epi8::<14i32>(a, b), + 15usize => _mm_alignr_epi8::<15i32>(a, b), + _ => unreachable!(), + } + } +} +#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] +#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] +#[inline(always)] +unsafe fn cross_block_alignr_128x2( + a: [__m128i; 2usize], + b: [__m128i; 2usize], + shift_bytes: usize, +) -> [__m128i; 2usize] { + [ + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + ] +} +#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] +#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] +#[inline(always)] +unsafe fn cross_block_alignr_128x4( + a: [__m128i; 4usize], + b: [__m128i; 4usize], + shift_bytes: usize, +) -> [__m128i; 4usize] { + [ + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, 
shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }, + ] +} diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 2c76d0fc..ef4b0ed6 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -133,6 +133,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_f32x4(a).val.0, + self.cvt_to_bytes_f32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x4( + self, + a: f32x4, + b: f32x4, + ) -> f32x4 { + self.slide_f32x4::(a, b) + } + #[inline(always)] fn abs_f32x4(self, a: f32x4) -> f32x4 { f32x4_abs(a.into()).simd_into(self) } @@ -392,6 +417,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_i8x16(a).val.0, + self.cvt_to_bytes_i8x16(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x16( + self, + a: i8x16, + b: i8x16, + ) -> i8x16 { + self.slide_i8x16::(a, b) + } + #[inline(always)] fn add_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { i8x16_add(a.into(), b.into()).simd_into(self) } @@ -574,6 +624,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_u8x16(a).val.0, + self.cvt_to_bytes_u8x16(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x16( + self, + a: u8x16, + b: u8x16, + ) -> u8x16 { + self.slide_u8x16::(a, b) + } + #[inline(always)] fn add_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { u8x16_add(a.into(), b.into()).simd_into(self) } @@ -754,6 +829,35 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_mask8x16(a).val.0, + self.cvt_to_bytes_mask8x16(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x16( + self, + a: mask8x16, + b: mask8x16, + ) -> mask8x16 { + self.slide_mask8x16::(a, b) + } + #[inline(always)] fn and_mask8x16(self, a: mask8x16, b: mask8x16) -> mask8x16 { v128_and(a.into(), b.into()).simd_into(self) } @@ -861,6 +965,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_i16x8(a).val.0, + self.cvt_to_bytes_i16x8(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 
{ + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x8( + self, + a: i16x8, + b: i16x8, + ) -> i16x8 { + self.slide_i16x8::(a, b) + } + #[inline(always)] fn add_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { i16x8_add(a.into(), b.into()).simd_into(self) } @@ -1027,6 +1156,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_u16x8(a).val.0, + self.cvt_to_bytes_u16x8(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x8( + self, + a: u16x8, + b: u16x8, + ) -> u16x8 { + self.slide_u16x8::(a, b) + } + #[inline(always)] fn add_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { u16x8_add(a.into(), b.into()).simd_into(self) } @@ -1189,6 +1343,35 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_mask16x8(a).val.0, + self.cvt_to_bytes_mask16x8(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_mask16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x8( + self, + a: mask16x8, + b: mask16x8, + ) -> mask16x8 { + self.slide_mask16x8::(a, b) + } + #[inline(always)] fn and_mask16x8(self, a: mask16x8, b: mask16x8) -> mask16x8 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1296,6 +1479,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_i32x4(a).val.0, + self.cvt_to_bytes_i32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x4( + self, + a: i32x4, + b: i32x4, + ) -> i32x4 { + self.slide_i32x4::(a, b) + } + #[inline(always)] fn add_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { i32x4_add(a.into(), b.into()).simd_into(self) } @@ -1466,6 +1674,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_u32x4(a).val.0, + self.cvt_to_bytes_u32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x4( + self, + a: u32x4, + b: u32x4, + ) -> u32x4 { + self.slide_u32x4::(a, b) + } + #[inline(always)] fn add_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { u32x4_add(a.into(), b.into()).simd_into(self) } @@ -1628,6 +1861,35 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_mask32x4(a).val.0, + self.cvt_to_bytes_mask32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x4( + self, + a: mask32x4, + b: mask32x4, + ) -> mask32x4 { + 
self.slide_mask32x4::(a, b) + } + #[inline(always)] fn and_mask32x4(self, a: mask32x4, b: mask32x4) -> mask32x4 { v128_and(a.into(), b.into()).simd_into(self) } @@ -1735,6 +1997,31 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { + if SHIFT >= 2usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_f64x2(a).val.0, + self.cvt_to_bytes_f64x2(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x2( + self, + a: f64x2, + b: f64x2, + ) -> f64x2 { + self.slide_f64x2::(a, b) + } + #[inline(always)] fn abs_f64x2(self, a: f64x2) -> f64x2 { f64x2_abs(a.into()).simd_into(self) } @@ -1952,6 +2239,35 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + if SHIFT >= 2usize { + return b; + } + unsafe { + let result = dyn_slide_128( + self.cvt_to_bytes_mask64x2(a).val.0, + self.cvt_to_bytes_mask64x2(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x2( + self, + a: mask64x2, + b: mask64x2, + ) -> mask64x2 { + self.slide_mask64x2::(a, b) + } + #[inline(always)] fn and_mask64x2(self, a: mask64x2, b: mask64x2) -> mask64x2 { v128_and(a.into(), b.into()).simd_into(self) } @@ -2070,6 +2386,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_f32x8(a).val.0, + self.cvt_to_bytes_f32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x8( + self, + a: f32x8, + b: f32x8, + ) -> f32x8 { + let (a0, a1) = self.split_f32x8(a); + let (b0, b1) = self.split_f32x8(b); + self.combine_f32x4( + self.slide_within_blocks_f32x4::(a0, b0), + self.slide_within_blocks_f32x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { let (a0, a1) = self.split_f32x8(a); self.combine_f32x4(self.abs_f32x4(a0), self.abs_f32x4(a1)) @@ -2388,6 +2734,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x32( + self, + a: i8x32, + b: i8x32, + ) -> i8x32 { + let (a0, a1) = self.split_i8x32(a); + let (b0, b1) = self.split_i8x32(b); + self.combine_i8x16( + self.slide_within_blocks_i8x16::(a0, b0), + self.slide_within_blocks_i8x16::(a1, b1), + ) + } + #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { let (a0, a1) = self.split_i8x32(a); let (b0, b1) = self.split_i8x32(b); @@ -2621,6 +2997,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_u8x32(b).val.0, + SHIFT, + ); + 
self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x32( + self, + a: u8x32, + b: u8x32, + ) -> u8x32 { + let (a0, a1) = self.split_u8x32(a); + let (b0, b1) = self.split_u8x32(b); + self.combine_u8x16( + self.slide_within_blocks_u8x16::(a0, b0), + self.slide_within_blocks_u8x16::(a1, b1), + ) + } + #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { let (a0, a1) = self.split_u8x32(a); let (b0, b1) = self.split_u8x32(b); @@ -2849,6 +3255,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_mask8x32(a).val.0, + self.cvt_to_bytes_mask8x32(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x32( + self, + a: mask8x32, + b: mask8x32, + ) -> mask8x32 { + let (a0, a1) = self.split_mask8x32(a); + let (b0, b1) = self.split_mask8x32(b); + self.combine_mask8x16( + self.slide_within_blocks_mask8x16::(a0, b0), + self.slide_within_blocks_mask8x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x32(self, a: mask8x32, b: mask8x32) -> mask8x32 { let (a0, a1) = self.split_mask8x32(a); let (b0, b1) = self.split_mask8x32(b); @@ -2992,6 +3432,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x16( + self, + a: i16x16, + b: i16x16, + ) -> i16x16 { + let (a0, a1) = self.split_i16x16(a); + let (b0, b1) = self.split_i16x16(b); + self.combine_i16x8( + self.slide_within_blocks_i16x8::(a0, b0), + self.slide_within_blocks_i16x8::(a1, b1), + ) + } + #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { let (a0, a1) = self.split_i16x16(a); let (b0, b1) = self.split_i16x16(b); @@ -3225,6 +3695,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_u16x16(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x16( + self, + a: u16x16, + b: u16x16, + ) -> u16x16 { + let (a0, a1) = self.split_u16x16(a); + let (b0, b1) = self.split_u16x16(b); + self.combine_u16x8( + self.slide_within_blocks_u16x8::(a0, b0), + self.slide_within_blocks_u16x8::(a1, b1), + ) + } + #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { let (a0, a1) = self.split_u16x16(a); let (b0, b1) = self.split_u16x16(b); @@ -3462,6 +3962,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_mask16x16(a).val.0, + self.cvt_to_bytes_mask16x16(b).val.0, + SHIFT * 2usize, + ); 
+ self.cvt_from_bytes_mask16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x16( + self, + a: mask16x16, + b: mask16x16, + ) -> mask16x16 { + let (a0, a1) = self.split_mask16x16(a); + let (b0, b1) = self.split_mask16x16(b); + self.combine_mask16x8( + self.slide_within_blocks_mask16x8::(a0, b0), + self.slide_within_blocks_mask16x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x16(self, a: mask16x16, b: mask16x16) -> mask16x16 { let (a0, a1) = self.split_mask16x16(a); let (b0, b1) = self.split_mask16x16(b); @@ -3605,6 +4139,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x8( + self, + a: i32x8, + b: i32x8, + ) -> i32x8 { + let (a0, a1) = self.split_i32x8(a); + let (b0, b1) = self.split_i32x8(b); + self.combine_i32x4( + self.slide_within_blocks_i32x4::(a0, b0), + self.slide_within_blocks_i32x4::(a1, b1), + ) + } + #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { let (a0, a1) = self.split_i32x8(a); let (b0, b1) = self.split_i32x8(b); @@ -3843,6 +4407,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_u32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x8( + self, + a: u32x8, + b: u32x8, + ) -> u32x8 { + let (a0, a1) = self.split_u32x8(a); + let (b0, b1) = self.split_u32x8(b); + self.combine_u32x4( + self.slide_within_blocks_u32x4::(a0, b0), + self.slide_within_blocks_u32x4::(a1, b1), + ) + } + #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { let (a0, a1) = self.split_u32x8(a); let (b0, b1) = self.split_u32x8(b); @@ -4068,6 +4662,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_mask32x8(a).val.0, + self.cvt_to_bytes_mask32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x8( + self, + a: mask32x8, + b: mask32x8, + ) -> mask32x8 { + let (a0, a1) = self.split_mask32x8(a); + let (b0, b1) = self.split_mask32x8(b); + self.combine_mask32x4( + self.slide_within_blocks_mask32x4::(a0, b0), + self.slide_within_blocks_mask32x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x8(self, a: mask32x8, b: mask32x8) -> mask32x8 { let (a0, a1) = self.split_mask32x8(a); let (b0, b1) = self.split_mask32x8(b); @@ -4211,6 +4839,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_f64x4(a).val.0, + self.cvt_to_bytes_f64x4(b).val.0, + SHIFT * 8usize, + ); + 
self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x4( + self, + a: f64x4, + b: f64x4, + ) -> f64x4 { + let (a0, a1) = self.split_f64x4(a); + let (b0, b1) = self.split_f64x4(b); + self.combine_f64x2( + self.slide_within_blocks_f64x2::(a0, b0), + self.slide_within_blocks_f64x2::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { let (a0, a1) = self.split_f64x4(a); self.combine_f64x2(self.abs_f64x2(a0), self.abs_f64x2(a1)) @@ -4482,6 +5140,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + if SHIFT >= 4usize { + return b; + } + unsafe { + let result = cross_block_slide_128x2( + self.cvt_to_bytes_mask64x4(a).val.0, + self.cvt_to_bytes_mask64x4(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x4( + self, + a: mask64x4, + b: mask64x4, + ) -> mask64x4 { + let (a0, a1) = self.split_mask64x4(a); + let (b0, b1) = self.split_mask64x4(b); + self.combine_mask64x2( + self.slide_within_blocks_mask64x2::(a0, b0), + self.slide_within_blocks_mask64x2::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x4(self, a: mask64x4, b: mask64x4) -> mask64x4 { let (a0, a1) = self.split_mask64x4(a); let (b0, b1) = self.split_mask64x4(b); @@ -4629,6 +5321,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_f32x16(a).val.0, + self.cvt_to_bytes_f32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f32x16( + self, + a: f32x16, + b: f32x16, + ) -> f32x16 { + let (a0, a1) = self.split_f32x16(a); + let (b0, b1) = self.split_f32x16(b); + self.combine_f32x8( + self.slide_within_blocks_f32x8::(a0, b0), + self.slide_within_blocks_f32x8::(a1, b1), + ) + } + #[inline(always)] fn abs_f32x16(self, a: f32x16) -> f32x16 { let (a0, a1) = self.split_f32x16(a); self.combine_f32x8(self.abs_f32x8(a0), self.abs_f32x8(a1)) @@ -4986,6 +5708,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { + if SHIFT >= 64usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i8x64(a).val.0, + self.cvt_to_bytes_i8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i8x64( + self, + a: i8x64, + b: i8x64, + ) -> i8x64 { + let (a0, a1) = self.split_i8x64(a); + let (b0, b1) = self.split_i8x64(b); + self.combine_i8x32( + self.slide_within_blocks_i8x32::(a0, b0), + self.slide_within_blocks_i8x32::(a1, b1), + ) + } + #[inline(always)] fn add_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { let (a0, a1) = self.split_i8x64(a); let (b0, b1) = self.split_i8x64(b); @@ -5216,6 +5968,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { + if SHIFT >= 64usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u8x64(a).val.0, + self.cvt_to_bytes_u8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + 
val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u8x64( + self, + a: u8x64, + b: u8x64, + ) -> u8x64 { + let (a0, a1) = self.split_u8x64(a); + let (b0, b1) = self.split_u8x64(b); + self.combine_u8x32( + self.slide_within_blocks_u8x32::(a0, b0), + self.slide_within_blocks_u8x32::(a1, b1), + ) + } + #[inline(always)] fn add_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { let (a0, a1) = self.split_u8x64(a); let (b0, b1) = self.split_u8x64(b); @@ -5502,6 +6284,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + if SHIFT >= 64usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_mask8x64(a).val.0, + self.cvt_to_bytes_mask8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_mask8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask8x64( + self, + a: mask8x64, + b: mask8x64, + ) -> mask8x64 { + let (a0, a1) = self.split_mask8x64(a); + let (b0, b1) = self.split_mask8x64(b); + self.combine_mask8x32( + self.slide_within_blocks_mask8x32::(a0, b0), + self.slide_within_blocks_mask8x32::(a1, b1), + ) + } + #[inline(always)] fn and_mask8x64(self, a: mask8x64, b: mask8x64) -> mask8x64 { let (a0, a1) = self.split_mask8x64(a); let (b0, b1) = self.split_mask8x64(b); @@ -5642,6 +6458,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i16x32(a).val.0, + self.cvt_to_bytes_i16x32(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i16x32( + self, + a: i16x32, + b: i16x32, + ) -> i16x32 { + let (a0, a1) = self.split_i16x32(a); + let (b0, b1) = self.split_i16x32(b); + self.combine_i16x16( + self.slide_within_blocks_i16x16::(a0, b0), + self.slide_within_blocks_i16x16::(a1, b1), + ) + } + #[inline(always)] fn add_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { let (a0, a1) = self.split_i16x32(a); let (b0, b1) = self.split_i16x32(b); @@ -5881,6 +6727,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u16x32(a).val.0, + self.cvt_to_bytes_u16x32(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u16x32( + self, + a: u16x32, + b: u16x32, + ) -> u16x32 { + let (a0, a1) = self.split_u16x32(a); + let (b0, b1) = self.split_u16x32(b); + self.combine_u16x16( + self.slide_within_blocks_u16x16::(a0, b0), + self.slide_within_blocks_u16x16::(a1, b1), + ) + } + #[inline(always)] fn add_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { let (a0, a1) = self.split_u16x32(a); let (b0, b1) = self.split_u16x32(b); @@ -6162,6 +7038,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + if SHIFT >= 32usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_mask16x32(a).val.0, + self.cvt_to_bytes_mask16x32(b).val.0, + SHIFT * 2usize, + ); + 
self.cvt_from_bytes_mask16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask16x32( + self, + a: mask16x32, + b: mask16x32, + ) -> mask16x32 { + let (a0, a1) = self.split_mask16x32(a); + let (b0, b1) = self.split_mask16x32(b); + self.combine_mask16x16( + self.slide_within_blocks_mask16x16::(a0, b0), + self.slide_within_blocks_mask16x16::(a1, b1), + ) + } + #[inline(always)] fn and_mask16x32(self, a: mask16x32, b: mask16x32) -> mask16x32 { let (a0, a1) = self.split_mask16x32(a); let (b0, b1) = self.split_mask16x32(b); @@ -6305,6 +7215,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i32x16(a).val.0, + self.cvt_to_bytes_i32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_i32x16( + self, + a: i32x16, + b: i32x16, + ) -> i32x16 { + let (a0, a1) = self.split_i32x16(a); + let (b0, b1) = self.split_i32x16(b); + self.combine_i32x8( + self.slide_within_blocks_i32x8::(a0, b0), + self.slide_within_blocks_i32x8::(a1, b1), + ) + } + #[inline(always)] fn add_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { let (a0, a1) = self.split_i32x16(a); let (b0, b1) = self.split_i32x16(b); @@ -6540,6 +7480,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u32x16(a).val.0, + self.cvt_to_bytes_u32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_u32x16( + self, + a: u32x16, + b: u32x16, + ) -> u32x16 { + let (a0, a1) = self.split_u32x16(a); + let (b0, b1) = self.split_u32x16(b); + self.combine_u32x8( + self.slide_within_blocks_u32x8::(a0, b0), + self.slide_within_blocks_u32x8::(a1, b1), + ) + } + #[inline(always)] fn add_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { let (a0, a1) = self.split_u32x16(a); let (b0, b1) = self.split_u32x16(b); @@ -6804,6 +7774,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + if SHIFT >= 16usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_mask32x16(a).val.0, + self.cvt_to_bytes_mask32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_mask32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask32x16( + self, + a: mask32x16, + b: mask32x16, + ) -> mask32x16 { + let (a0, a1) = self.split_mask32x16(a); + let (b0, b1) = self.split_mask32x16(b); + self.combine_mask32x8( + self.slide_within_blocks_mask32x8::(a0, b0), + self.slide_within_blocks_mask32x8::(a1, b1), + ) + } + #[inline(always)] fn and_mask32x16(self, a: mask32x16, b: mask32x16) -> mask32x16 { let (a0, a1) = self.split_mask32x16(a); let (b0, b1) = self.split_mask32x16(b); @@ -6944,6 +7948,36 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_f64x8(a).val.0, + 
self.cvt_to_bytes_f64x8(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_f64x8( + self, + a: f64x8, + b: f64x8, + ) -> f64x8 { + let (a0, a1) = self.split_f64x8(a); + let (b0, b1) = self.split_f64x8(b); + self.combine_f64x4( + self.slide_within_blocks_f64x4::(a0, b0), + self.slide_within_blocks_f64x4::(a1, b1), + ) + } + #[inline(always)] fn abs_f64x8(self, a: f64x8) -> f64x8 { let (a0, a1) = self.split_f64x8(a); self.combine_f64x4(self.abs_f64x4(a0), self.abs_f64x4(a1)) @@ -7212,6 +8246,40 @@ impl Simd for WasmSimd128 { } } #[inline(always)] + fn slide_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + if SHIFT >= 8usize { + return b; + } + unsafe { + let result = cross_block_slide_128x4( + self.cvt_to_bytes_mask64x8(a).val.0, + self.cvt_to_bytes_mask64x8(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_mask64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) + } + } + #[inline(always)] + fn slide_within_blocks_mask64x8( + self, + a: mask64x8, + b: mask64x8, + ) -> mask64x8 { + let (a0, a1) = self.split_mask64x8(a); + let (b0, b1) = self.split_mask64x8(b); + self.combine_mask64x4( + self.slide_within_blocks_mask64x4::(a0, b0), + self.slide_within_blocks_mask64x4::(a1, b1), + ) + } + #[inline(always)] fn and_mask64x8(self, a: mask64x8, b: mask64x8) -> mask64x8 { let (a0, a1) = self.split_mask64x8(a); let (b0, b1) = self.split_mask64x8(b); @@ -7469,3 +8537,90 @@ impl From> for v128 { unsafe { core::mem::transmute_copy(&value.val) } } } +#[doc = r" This is a vector extend, like `vext` on ARM or `alignr` on x86, that takes a non-const shift argument."] +#[doc = r" The shift is still expected to be constant in practice, so the match statement will be optimized out."] +#[doc = r" This exists because Rust doesn't currently let you do math on const generics."] +#[inline(always)] +unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { + unsafe { + match shift { + 0 => i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(a, b), + 1 => i8x16_shuffle::<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>(a, b), + 2 => i8x16_shuffle::<2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>(a, b), + 3 => i8x16_shuffle::<3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18>(a, b), + 4 => i8x16_shuffle::<4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19>(a, b), + 5 => i8x16_shuffle::<5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>(a, b), + 6 => i8x16_shuffle::<6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21>(a, b), + 7 => i8x16_shuffle::<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22>(a, b), + 8 => { + i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23>(a, b) + } + 9 => { + i8x16_shuffle::<9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24>(a, b) + } + 10 => i8x16_shuffle::<10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25>( + a, b, + ), + 11 => i8x16_shuffle::<11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26>( + a, b, + ), + 12 => i8x16_shuffle::<12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27>( + a, b, + ), + 13 => i8x16_shuffle::<13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28>( + a, b, + ), + 14 => i8x16_shuffle::<14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29>( + a, b, + ), + 15 => i8x16_shuffle::<15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 
26, 27, 28, 29, 30>( + a, b, + ), + _ => unreachable!(), + } + } +} +#[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] +#[inline(always)] +unsafe fn cross_block_slide_128x2( + a: [v128; 2usize], + b: [v128; 2usize], + shift_bytes: usize, +) -> [v128; 2usize] { + [ + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + ] +} +#[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] +#[inline(always)] +unsafe fn cross_block_slide_128x4( + a: [v128; 4usize], + b: [v128; 4usize], + shift_bytes: usize, +) -> [v128; 4usize] { + [ + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 2usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 3usize, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + }, + ] +} diff --git a/fearless_simd/src/support.rs b/fearless_simd/src/support.rs index d6db4ae1..2c298326 100644 --- a/fearless_simd/src/support.rs +++ b/fearless_simd/src/support.rs @@ -42,3 +42,24 @@ pub(crate) fn simd_debug_impl( .field("simd", token) .finish() } + +/// Selects the input operands to be used for `slignr`/`vext`/etc. when computing a single output block for cross-block +/// "slide" operations. Extracts from [a : b]. 
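+///
+/// As a worked example of the indexing below (illustrative values, not from the original docs):
+/// with `N = 2`, `out_idx = 1`, and `shift_bytes = 20`, the output block starts at byte
+/// `1 * 16 + 20 = 36` of the `[a : b]` concatenation, so `lo_idx = 2` and `hi_idx = 3`, and the
+/// returned pair is `[b[0], b[1]]`.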
+#[inline(always)] +#[allow(clippy::allow_attributes, reason = "Only needed in some cfgs.")] +#[allow(dead_code, reason = "Only used in some cfgs.")] +pub(crate) fn cross_block_slide_blocks_at( + a: &[Block; N], + b: &[Block; N], + out_idx: usize, + shift_bytes: usize, +) -> [Block; 2] { + const BLOCK_BYTES: usize = 16; + let out_byte_start = out_idx * BLOCK_BYTES + shift_bytes; + let lo_idx = out_byte_start.div_euclid(BLOCK_BYTES); + let hi_idx = lo_idx + 1; + // Concatenation is [a : b], so indices 0..N are from a, indices N..2N are from b + let lo_block = if lo_idx < N { a[lo_idx] } else { b[lo_idx - N] }; + let hi_block = if hi_idx < N { a[hi_idx] } else { b[hi_idx - N] }; + [lo_block, hi_block] +} diff --git a/fearless_simd_dev_macros/src/lib.rs b/fearless_simd_dev_macros/src/lib.rs index 1cb2aa9c..f43ca87c 100644 --- a/fearless_simd_dev_macros/src/lib.rs +++ b/fearless_simd_dev_macros/src/lib.rs @@ -80,7 +80,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { fn #sse4_name() { if std::arch::is_x86_feature_detected!("sse4.2") { let sse4 = unsafe { fearless_simd::x86::Sse4_2::new_unchecked() }; - #input_fn_name(sse4); + sse4.vectorize( + #[inline(always)] + || #input_fn_name(sse4) + ); } } }; @@ -94,7 +97,10 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { && std::arch::is_x86_feature_detected!("fma") { let avx2 = unsafe { fearless_simd::x86::Avx2::new_unchecked() }; - #input_fn_name(avx2); + avx2.vectorize( + #[inline(always)] + || #input_fn_name(avx2) + ); } } }; @@ -110,6 +116,7 @@ pub fn simd_test(_: TokenStream, item: TokenStream) -> TokenStream { }; quote! { + #[inline(always)] #input_fn #fallback_snippet diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs index 1900e171..9332d2ad 100644 --- a/fearless_simd_gen/src/generic.rs +++ b/fearless_simd_gen/src/generic.rs @@ -5,7 +5,7 @@ use proc_macro2::{Ident, Span, TokenStream}; use quote::{ToTokens, quote}; use crate::{ - ops::{Op, OpSig, RefKind}, + ops::{Op, OpSig, RefKind, SlideGranularity}, types::{ScalarType, VecType}, }; @@ -203,6 +203,32 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream { } OpSig::FromBytes => generic_from_bytes(method_sig, ty), OpSig::ToBytes => generic_to_bytes(method_sig, ty), + OpSig::Slide { granularity, .. } => { + match (granularity, ty.n_bits()) { + (SlideGranularity::WithinBlocks, 128) => { + // If this operation is done on a 128-bit vector type, the "within blocks" method is identical to the + // non-within-blocks one, so just defer to that. + let non_blockwise = generic_op_name("slide", ty); + quote! { + #method_sig { + self.#non_blockwise::(a, b) + } + } + } + (SlideGranularity::WithinBlocks, _) => { + quote! { + #method_sig { + let (a0, a1) = self.#split(a); + let (b0, b1) = self.#split(b); + self.#combine(self.#do_half::(a0, b0), self.#do_half::(a1, b1)) + } + } + } + _ => { + panic!("Item-wise shifts across blocks cannot be done via split/combine"); + } + } + } } } diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs index 7043a578..31e0b3fc 100644 --- a/fearless_simd_gen/src/level.rs +++ b/fearless_simd_gen/src/level.rs @@ -72,6 +72,12 @@ pub(crate) trait Level { } } + /// Any additional supporting code necessary for the module's implementation, but placed *after* the `Simd` + /// implementation itself. + fn make_module_footer(&self) -> TokenStream { + TokenStream::new() + } + /// The body of the `Simd::level` function. This can be overridden, e.g. 
to return `Level::baseline()` if we know a /// higher SIMD level is statically enabled. fn make_level_body(&self) -> TokenStream { @@ -209,6 +215,7 @@ pub(crate) trait Level { let arch_types_impl = self.impl_arch_types(); let simd_impl = self.make_simd_impl(); let ty_impl = self.make_type_impl(); + let footer = self.make_module_footer(); quote! { use crate::{prelude::*, seal::Seal, arch_types::ArchTypes, Level}; @@ -234,6 +241,8 @@ pub(crate) trait Level { #simd_impl #ty_impl + + #footer } } } diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs index cd253243..09c0fa8a 100644 --- a/fearless_simd_gen/src/mk_fallback.rs +++ b/fearless_simd_gen/src/mk_fallback.rs @@ -369,6 +369,17 @@ impl Level for Fallback { } } } + OpSig::Slide { .. } => { + let n = vec_ty.len; + quote! { + #method_sig { + let mut dest = [Default::default(); #n]; + dest[..#n - SHIFT].copy_from_slice(&a.val.0[SHIFT..]); + dest[#n - SHIFT..].copy_from_slice(&b.val.0[..SHIFT]); + dest.simd_into(self) + } + } + } OpSig::Cvt { target_ty, scalar_bits, diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 2e4af67a..d7fa9a86 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -9,7 +9,7 @@ use crate::generic::{ generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_to_bytes, }; use crate::level::Level; -use crate::ops::{Op, valid_reinterpret}; +use crate::ops::{Op, SlideGranularity, valid_reinterpret}; use crate::{ arch::neon::{self, cvt_intrinsic, simple_intrinsic, split_intrinsic}, ops::OpSig, @@ -66,6 +66,10 @@ impl Level for Neon { } } + fn make_module_footer(&self) -> TokenStream { + mk_slide_helpers() + } + fn make_impl_body(&self) -> TokenStream { quote! { #[inline] @@ -395,6 +399,72 @@ impl Level for Neon { } } } + OpSig::Slide { granularity } => { + use SlideGranularity::*; + + let block_wrapper = vec_ty.aligned_wrapper(); + let bytes_ty = vec_ty.reinterpret(ScalarType::Unsigned, 8); + let combined_bytes = bytes_ty.rust(); + let scalar_bytes = vec_ty.scalar_bits / 8; + let num_items = vec_ty.len; + let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); + let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + let bytes_expr = match (granularity, vec_ty.n_bits()) { + (WithinBlocks, 128) => { + panic!("This should have been handled by generic_op"); + } + (WithinBlocks, _) | (_, 128) => { + quote! { + unsafe { + dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift) + } + } + } + (AcrossBlocks, 256 | 512) => { + let num_blocks = vec_ty.n_bits() / 128; + + // Ranges are not `Copy`, so we need to create a new range iterator for each usage + let blocks = (0..num_blocks).map(Literal::usize_unsuffixed); + let blocks2 = blocks.clone(); + let blocks3 = blocks.clone(); + let bytes_arch_ty = self.arch_ty(&bytes_ty); + + quote! { + unsafe { + let a_bytes = self.#to_bytes(a).val.0; + let b_bytes = self.#to_bytes(b).val.0; + let a_blocks = [#( a_bytes.#blocks ),*]; + let b_blocks = [#( b_bytes.#blocks2 ),*]; + + let shift_bytes = #byte_shift; + #bytes_arch_ty(#({ + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes); + dyn_vext_128(lo, hi, shift_bytes % 16) + }),*) + } + } + } + _ => unimplemented!(), + }; + + quote! 
{ + #method_sig { + if SHIFT >= #num_items { + return b; + } + + let result = #bytes_expr; + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + } OpSig::Cvt { target_ty, scalar_bits, @@ -478,3 +548,25 @@ impl Level for Neon { } } } + +fn mk_slide_helpers() -> TokenStream { + let shifts = (0_usize..16).map(|shift| { + let shift_i32 = i32::try_from(shift).unwrap(); + quote! { #shift => vextq_u8::<#shift_i32>(a, b) } + }); + + quote! { + /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still + /// expected to be constant in practice, so the match statement will be optimized out. This exists because + /// Rust doesn't currently let you do math on const generics. + #[inline(always)] + unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { + unsafe { + match shift { + #(#shifts,)* + _ => unreachable!() + } + } + } + } +} diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index 1e9c39ad..b62a5ffb 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -176,6 +176,28 @@ fn mk_simd_base() -> TokenStream { /// calling `f` with that element's lane index (from 0 to /// [`SimdBase::N`] - 1). fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self; + + /// Concatenate `[self, rhs]` and extract `Self::N` elements + /// starting at index `SHIFT`. + /// + /// `SHIFT` must be within [0, `Self::N`]. + /// + /// This can be used to implement a "shift items" operation by + /// providing all zeroes as one operand. For a left shift, the + /// right-hand side should be all zeroes. For a right shift by `M` + /// items, the left-hand side should be all zeroes, and the shift + /// amount will be `Self::N - M`. + /// + /// This can also be used to rotate items within a vector by + /// providing the same vector as both operands. + /// + /// ```text + /// slide::<1>([a b c d], [e f g h]) == [b c d e] + /// ``` + fn slide(self, rhs: impl SimdInto) -> Self; + /// Like [`slide`](SimdBase::slide), but operates independently on + /// each 128-bit block. + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self; } } } diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs index a1be080d..8fb24df6 100644 --- a/fearless_simd_gen/src/mk_simd_types.rs +++ b/fearless_simd_gen/src/mk_simd_types.rs @@ -291,6 +291,8 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream { let from_array_op = generic_op_name("load_array", ty); let as_array_ref_op = generic_op_name("as_array_ref", ty); let as_array_mut_op = generic_op_name("as_array_mut", ty); + let slide_op = generic_op_name("slide", ty); + let slide_blockwise_op = generic_op_name("slide_within_blocks", ty); quote! 
{ impl SimdBase for #name { type Element = #scalar; @@ -334,6 +336,15 @@ fn simd_vec_impl(ty: &VecType) -> TokenStream { simd.#from_array_op(core::array::from_fn(f)) } + #[inline(always)] + fn slide(self, rhs: impl SimdInto) -> Self { + self.simd.#slide_op::(self, rhs.simd_into(self.simd)) + } + + #[inline(always)] + fn slide_within_blocks(self, rhs: impl SimdInto) -> Self { + self.simd.#slide_blockwise_op::(self, rhs.simd_into(self.simd)) + } } impl crate::#vec_trait_id for #name { #( #methods )* diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 61da3d70..81b9072c 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -1,7 +1,7 @@ // Copyright 2025 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT -use proc_macro2::{Ident, Span, TokenStream}; +use proc_macro2::{Ident, Literal, Span, TokenStream}; use quote::{format_ident, quote}; use crate::arch::wasm::{arch_prefix, v128_intrinsic}; @@ -10,7 +10,7 @@ use crate::generic::{ generic_from_bytes, generic_op_name, generic_to_bytes, scalar_binary, }; use crate::level::Level; -use crate::ops::{Op, Quantifier, valid_reinterpret}; +use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret}; use crate::{ arch::wasm::{self, simple_intrinsic}, ops::OpSig, @@ -55,6 +55,10 @@ impl Level for WasmSimd128 { } } + fn make_module_footer(&self) -> TokenStream { + mk_slide_helpers() + } + fn make_impl_body(&self) -> TokenStream { quote! { #[inline] @@ -360,6 +364,48 @@ impl Level for WasmSimd128 { } } } + OpSig::Slide { granularity } => { + use SlideGranularity::*; + + let block_wrapper = vec_ty.aligned_wrapper(); + let combined_bytes = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + let scalar_bytes = vec_ty.scalar_bits / 8; + let num_items = vec_ty.len; + let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); + let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + + let slide_op = match (granularity, vec_ty.n_bits()) { + (WithinBlocks, 128) => { + panic!("This should have been handled by generic_op"); + } + (WithinBlocks, _) | (_, 128) => { + format_ident!("dyn_slide_{}", vec_ty.n_bits()) + } + (AcrossBlocks, 256 | 512) => { + format_ident!("cross_block_slide_128x{}", vec_ty.n_bits() / 128) + } + _ => unimplemented!(), + }; + + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + quote! { + #method_sig { + if SHIFT >= #num_items { + return b; + } + + unsafe { + let result = #slide_op(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + } + } OpSig::Cvt { target_ty, scalar_bits, @@ -628,3 +674,59 @@ impl Level for WasmSimd128 { } } } + +fn mk_slide_helpers() -> TokenStream { + let mut fns = vec![]; + + // This behaves like ARM's vext instruction: `a` is the "left" vector and `b` is the "right". + let shifts = (0_usize..16).map(|shift| { + let indices = (shift..shift + 16).map(Literal::usize_unsuffixed); + let shift_literal = Literal::usize_unsuffixed(shift); + quote! { #shift_literal => i8x16_shuffle::<#( #indices ),*>(a, b) } + }); + + fns.push(quote! { + /// This is a vector extend, like `vext` on ARM or `alignr` on x86, that takes a non-const shift argument. + /// The shift is still expected to be constant in practice, so the match statement will be optimized out. + /// This exists because Rust doesn't currently let you do math on const generics. 
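+        ///
+        /// As an illustrative example (the shift value is arbitrary): `dyn_slide_128(a, b, 4)`
+        /// dispatches to `i8x16_shuffle::<4, 5, ..., 19>(a, b)`, i.e. it selects bytes `4..20`
+        /// of the concatenation `[a : b]`, with `a` in the low bytes.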
+ #[inline(always)] + unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { + unsafe { + match shift { + #(#shifts,)* + _ => unreachable!() + } + } + } + }); + + // Generate cross_block_alignr helper for N=2 and N=4 (256-bit and 512-bit vectors) + for num_blocks in [2_usize, 4_usize] { + let helper_name = format_ident!("cross_block_slide_128x{}", num_blocks); + + // Generate the explicit unrolled calls for each output block + let block_calls: Vec<_> = (0..num_blocks) + .map(|i| { + quote! { + { + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, #i, shift_bytes); + unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + } + } + }) + .collect(); + + fns.push(quote! { + /// Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`. + #[inline(always)] + unsafe fn #helper_name(a: [v128; #num_blocks], b: [v128; #num_blocks], shift_bytes: usize) -> [v128; #num_blocks] { + // Explicitly unrolled to help LLVM optimize + [#(#block_calls),*] + } + }); + } + + quote! { + #( #fns )* + } +} diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index 15973f6e..b89a02d3 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -11,10 +11,10 @@ use crate::generic::{ generic_from_bytes, generic_op_name, generic_to_bytes, scalar_binary, }; use crate::level::Level; -use crate::ops::{Op, OpSig, Quantifier, valid_reinterpret}; +use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret}; use crate::types::{ScalarType, VecType}; use proc_macro2::{Ident, Span, TokenStream}; -use quote::{ToTokens as _, quote}; +use quote::{ToTokens as _, format_ident, quote}; #[derive(Clone, Copy, PartialEq, Eq)] pub(crate) enum X86 { @@ -82,6 +82,19 @@ impl Level for X86 { } } + fn make_module_footer(&self) -> TokenStream { + let alignr_helpers = self.dyn_alignr_helpers(); + let slide_helpers = match self { + X86::Sse4_2 => Self::sse42_slide_helpers(), + X86::Avx2 => Self::avx2_slide_helpers(), + }; + + quote! 
{ + #alignr_helpers + #slide_helpers + } + } + fn make_level_body(&self) -> TokenStream { let level_tok = self.token(); match self { @@ -149,6 +162,7 @@ impl Level for X86 { OpSig::Split { half_ty } => self.handle_split(method_sig, vec_ty, &half_ty), OpSig::Zip { select_low } => self.handle_zip(method_sig, vec_ty, select_low), OpSig::Unzip { select_even } => self.handle_unzip(method_sig, vec_ty, select_even), + OpSig::Slide { granularity } => self.handle_slide(method_sig, vec_ty, granularity), OpSig::Cvt { target_ty, scalar_bits, @@ -941,6 +955,65 @@ impl X86 { } } + pub(crate) fn handle_slide( + &self, + method_sig: TokenStream, + vec_ty: &VecType, + granularity: SlideGranularity, + ) -> TokenStream { + use SlideGranularity::*; + + let block_wrapper = vec_ty.aligned_wrapper(); + let combined_bytes = vec_ty.reinterpret(ScalarType::Unsigned, 8).rust(); + let scalar_bytes = vec_ty.scalar_bits / 8; + let max_shift = match granularity { + WithinBlocks => vec_ty.len / (vec_ty.n_bits() / 128), + AcrossBlocks => vec_ty.len, + }; + let to_bytes = generic_op_name("cvt_to_bytes", vec_ty); + let from_bytes = generic_op_name("cvt_from_bytes", vec_ty); + + let alignr_op = match (granularity, vec_ty.n_bits(), self) { + (WithinBlocks, 128, _) => { + panic!("This should have been handled by generic_op"); + } + (WithinBlocks, _, _) | (_, 128, _) => { + // For WithinBlocks, use elements per 128-bit block; for 128-bit vectors, use total elements + format_ident!("dyn_alignr_{}", vec_ty.n_bits()) + } + (AcrossBlocks, 256 | 512, Self::Sse4_2) => { + // Inter-block shift or rotate in SSE4.2: use cross_block_alignr + + format_ident!("cross_block_alignr_128x{}", vec_ty.n_bits() / 128) + } + (AcrossBlocks, 256 | 512, Self::Avx2) => { + format_ident!("cross_block_alignr_256x{}", vec_ty.n_bits() / 256) + } + _ => unimplemented!(), + }; + let byte_shift = if scalar_bytes == 1 { + quote! { SHIFT } + } else { + quote! { SHIFT * #scalar_bytes } + }; + + quote! { + #method_sig { + unsafe { + if SHIFT >= #max_shift { + return b; + } + + // b and a are swapped here to match ARM's vext semantics. For vext, we can think of `a` as the "left", + // and we concatenate `b` to its "right". This makes sense, since `a` is the left-hand side and `b` is + // the right-hand side. x86's `alignr` is backwards, and treats `b` as the high/left block. + let result = #alignr_op(self.#to_bytes(b).val.0, self.#to_bytes(a).val.0, #byte_shift); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + } + } + } + } + pub(crate) fn handle_cvt( &self, method_sig: TokenStream, @@ -1489,4 +1562,127 @@ impl X86 { } } } + + fn dyn_alignr_helpers(&self) -> TokenStream { + let mut fns = vec![]; + + let vec_widths: &[usize] = match self { + X86::Sse4_2 => &[128], + X86::Avx2 => &[128, 256], + }; + + for vec_ty in vec_widths + .iter() + .map(|n| VecType::new(ScalarType::Int, 8, *n / 8)) + { + let arch_ty = self.arch_ty(&vec_ty); + + let helper_name = format_ident!("dyn_alignr_{}", vec_ty.n_bits()); + let alignr_intrinsic = simple_sign_unaware_intrinsic("alignr", &vec_ty); + let shifts = (0_usize..16).map(|shift| { + let shift_i32 = i32::try_from(shift).unwrap(); + quote! { #shift => #alignr_intrinsic::<#shift_i32>(a, b) } + }); + + fns.push(quote! { + /// This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still + /// expected to be constant in practice, so the match statement will be optimized out. This exists because + /// Rust doesn't currently let you do math on const generics. 
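+                ///
+                /// As with the underlying intrinsic, `a` is the high operand and `b` the low operand,
+                /// so the result is drawn from the concatenation `[b : a]`; the 256-bit variant does
+                /// this independently within each 128-bit lane. This is why the `slide_*`
+                /// implementations above pass `b` as the first argument.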
+ #[inline(always)] + unsafe fn #helper_name(a: #arch_ty, b: #arch_ty, shift: usize) -> #arch_ty { + unsafe { + match shift { + #(#shifts,)* + _ => unreachable!() + } + } + } + }); + } + + quote! { #( #fns )* } + } + + fn sse42_slide_helpers() -> TokenStream { + let mut fns = vec![]; + + for num_blocks in [2_usize, 4_usize] { + let helper_name = format_ident!("cross_block_alignr_128x{}", num_blocks); + let blocks_idx = 0..num_blocks; + + // Unroll the construction of the blocks. I tried using `array::from_fn`, but the compiler thought the + // closure was too big and didn't inline it. + fns.push(quote! { + /// Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`. + /// Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics. + #[inline(always)] + unsafe fn #helper_name(a: [__m128i; #num_blocks], b: [__m128i; #num_blocks], shift_bytes: usize) -> [__m128i; #num_blocks] { + [#({ + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, #blocks_idx, shift_bytes); + unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } + }),*] + } + }); + } + + quote! { + #(#fns)* + } + } + + fn avx2_slide_helpers() -> TokenStream { + quote! { + /// Computes one output __m256i for `cross_block_alignr_*` operations. + /// + /// Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and + /// `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`. + #[inline(always)] + unsafe fn cross_block_alignr_one(regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i { + let lo_idx = block_idx + (shift_bytes / 16); + let intra_shift = shift_bytes % 16; + let lo_blocks = if lo_idx % 2 == 0 { + regs[lo_idx / 2] + } else { + unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } + }; + + // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`) + let hi_idx = lo_idx + 1; + let hi_blocks = if hi_idx % 2 == 0 { + regs[hi_idx / 2] + } else { + unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) } + }; + + unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) } + } + + /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset + /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. + #[inline(always)] + unsafe fn cross_block_alignr_256x2(a: [__m256i; 2], b: [__m256i; 2], shift_bytes: usize) -> [__m256i; 2] { + // Concatenation is [b : a], so b blocks come first + let regs = [b[0], b[1], a[0], a[1]]; + + unsafe { + [ + cross_block_alignr_one(®s, 0, shift_bytes), + cross_block_alignr_one(®s, 2, shift_bytes), + ] + } + } + + /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset + /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. 
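+            ///
+            /// For example (the shift value here is arbitrary), `cross_block_alignr_256x1(a, b, 4)`
+            /// returns bytes `4..36` of the concatenation `[b : a]`, i.e. a four-byte `alignr`
+            /// carried across the full 256 bits rather than within each 128-bit lane.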
+ #[inline(always)] + unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { + // Concatenation is [b : a], so b comes first + let regs = [b, a]; + + unsafe { + cross_block_alignr_one(®s, 0, shift_bytes) + } + } + } + } } diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index a7462f47..e637ef7c 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -43,6 +43,12 @@ impl RefKind { } } +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) enum SlideGranularity { + WithinBlocks, + AcrossBlocks, +} + #[derive(Clone, Copy)] pub(crate) enum OpSig { /// Takes a single argument of the underlying SIMD element type, and returns the corresponding vector type. @@ -66,6 +72,8 @@ pub(crate) enum OpSig { Zip { select_low: bool }, /// Takes two arguments of a vector type, and returns that same vector type. Unzip { select_even: bool }, + /// Takes two arguments of a vector type, plus a const generic shift amount, and returns that same vector type. + Slide { granularity: SlideGranularity }, /// Takes a single argument of the source vector type, and returns a vector type of the target scalar type and the /// same length. Cvt { @@ -211,6 +219,11 @@ impl Op { let arg1 = &arg_names[1]; quote! { (self, #arg0: #ty, #arg1: #ty) -> #ty } } + OpSig::Slide { .. } => { + let arg0 = &arg_names[0]; + let arg1 = &arg_names[1]; + quote! { (self, #arg0: #ty, #arg1: #ty) -> #ty } + } OpSig::Cvt { target_ty, scalar_bits, @@ -316,6 +329,11 @@ impl Op { let arg1 = &arg_names[1]; quote! { (#arg0, #arg1: impl SimdInto) -> Self } } + OpSig::Slide { .. } => { + let arg0 = &arg_names[0]; + let arg1 = &arg_names[1]; + quote! { (#arg0, #arg1: impl SimdInto) -> Self } + } OpSig::Compare => { let arg0 = &arg_names[0]; let arg1 = &arg_names[1]; @@ -446,6 +464,22 @@ const BASE_OPS: &[Op] = &[ OpSig::ToBytes, "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length.", ), + Op::new( + "slide", + OpKind::BaseTraitMethod, + OpSig::Slide { + granularity: SlideGranularity::AcrossBlocks, + }, + "", + ), + Op::new( + "slide_within_blocks", + OpKind::BaseTraitMethod, + OpSig::Slide { + granularity: SlideGranularity::WithinBlocks, + }, + "", + ), ]; const FLOAT_OPS: &[Op] = &[ @@ -1282,10 +1316,23 @@ impl OpSig { | Self::StoreInterleaved { .. } | Self::FromArray { .. } | Self::AsArray { .. } + | Self::Slide { + granularity: SlideGranularity::AcrossBlocks, + .. + } ) { return false; } + // For a block-wise item slide/shift, defer to the non-block-wise version if the operand is 1 block wide anyway + if let Self::Slide { + granularity: SlideGranularity::WithinBlocks, + } = self + && vec_ty.n_bits() == 128 + { + return true; + } + // Otherwise, defer to split/combine if this is a wider operation than natively supported. if vec_ty.n_bits() <= native_width { return false; @@ -1310,7 +1357,8 @@ impl OpSig { | Self::Compare | Self::Combine { .. } | Self::Zip { .. } - | Self::Unzip { .. } => &["a", "b"], + | Self::Unzip { .. } + | Self::Slide { .. } => &["a", "b"], Self::Ternary | Self::Select => &["a", "b", "c"], Self::Shift => &["a", "shift"], Self::LoadInterleaved { .. } => &["src"], @@ -1331,9 +1379,11 @@ impl OpSig { | Self::MaskReduce { .. } | Self::AsArray { .. } | Self::ToBytes => &["self"], - Self::Binary | Self::Compare | Self::Zip { .. } | Self::Unzip { .. } => { - &["self", "rhs"] - } + Self::Binary + | Self::Compare + | Self::Zip { .. } + | Self::Unzip { .. } + | Self::Slide { .. 
} => &["self", "rhs"], Self::Shift => &["self", "shift"], Self::Ternary => &["self", "op1", "op2"], Self::Select | Self::Split { .. } | Self::Combine { .. } => &[], @@ -1378,7 +1428,8 @@ impl OpSig { | Self::FromArray { .. } | Self::AsArray { .. } | Self::FromBytes - | Self::ToBytes => return None, + | Self::ToBytes + | Self::Slide { .. } => return None, }; Some(args) } diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 233a16be..5d526fae 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -2984,3 +2984,495 @@ fn index_consistency(simd: S) { assert_eq!(i, *v.index_mut(i) as usize); } } + +// ==================== Slide tests ==================== + +#[simd_test] +fn slide_f32x4(simd: S) { + let a = f32x4::from_slice(simd, &[1.0, 2.0, 3.0, 4.0]); + let b = f32x4::from_slice(simd, &[5.0, 6.0, 7.0, 8.0]); + assert_eq!(*a.slide::<0>(b), [1.0, 2.0, 3.0, 4.0]); // returns a + assert_eq!(*a.slide::<1>(b), [2.0, 3.0, 4.0, 5.0]); + assert_eq!(*a.slide::<2>(b), [3.0, 4.0, 5.0, 6.0]); + assert_eq!(*a.slide::<4>(b), [5.0, 6.0, 7.0, 8.0]); // returns b +} + +#[simd_test] +fn slide_f32x8(simd: S) { + let a = f32x8::from_slice(simd, &[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + let b = f32x8::from_slice(simd, &[9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]); + assert_eq!(*a.slide::<0>(b), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]); + assert_eq!(*a.slide::<1>(b), [2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]); + assert_eq!( + *a.slide::<4>(b), + [5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0] + ); + assert_eq!( + *a.slide::<7>(b), + [8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0] + ); + assert_eq!( + *a.slide::<8>(b), + [9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0] + ); +} + +#[simd_test] +fn slide_f32x16(simd: S) { + let a: Vec = (1_i16..=16).map(f32::from).collect(); + let b: Vec = (17_i16..=32).map(f32::from).collect(); + let a = f32x16::from_slice(simd, &a); + let b = f32x16::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=16).map(f32::from).collect(); + let expected_1: Vec = (2_i16..=17).map(f32::from).collect(); + let expected_8: Vec = (9_i16..=24).map(f32::from).collect(); + let expected_15: Vec = (16_i16..=31).map(f32::from).collect(); + let expected_16: Vec = (17_i16..=32).map(f32::from).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<15>(b).as_slice(), &expected_15); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_f64x2(simd: S) { + let a = f64x2::from_slice(simd, &[1.0, 2.0]); + let b = f64x2::from_slice(simd, &[3.0, 4.0]); + assert_eq!(*a.slide::<0>(b), [1.0, 2.0]); + assert_eq!(*a.slide::<1>(b), [2.0, 3.0]); + assert_eq!(*a.slide::<2>(b), [3.0, 4.0]); +} + +#[simd_test] +fn slide_f64x4(simd: S) { + let a = f64x4::from_slice(simd, &[1.0, 2.0, 3.0, 4.0]); + let b = f64x4::from_slice(simd, &[5.0, 6.0, 7.0, 8.0]); + assert_eq!(*a.slide::<0>(b), [1.0, 2.0, 3.0, 4.0]); + assert_eq!(*a.slide::<1>(b), [2.0, 3.0, 4.0, 5.0]); + assert_eq!(*a.slide::<2>(b), [3.0, 4.0, 5.0, 6.0]); + assert_eq!(*a.slide::<3>(b), [4.0, 5.0, 6.0, 7.0]); + assert_eq!(*a.slide::<4>(b), [5.0, 6.0, 7.0, 8.0]); +} + +#[simd_test] +fn slide_f64x8(simd: S) { + let a: Vec = (1..=8).map(f64::from).collect(); + let b: Vec = (9..=16).map(f64::from).collect(); + let a = f64x8::from_slice(simd, &a); + let b = f64x8::from_slice(simd, &b); + let 
expected_0: Vec = (1..=8).map(f64::from).collect(); + let expected_1: Vec = (2..=9).map(f64::from).collect(); + let expected_4: Vec = (5..=12).map(f64::from).collect(); + let expected_7: Vec = (8..=15).map(f64::from).collect(); + let expected_8: Vec = (9..=16).map(f64::from).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<4>(b).as_slice(), &expected_4); + assert_eq!(a.slide::<7>(b).as_slice(), &expected_7); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); +} + +#[simd_test] +fn slide_i8x16(simd: S) { + let a: Vec = (1_i8..=16).collect(); + let b: Vec = (17_i8..=32).collect(); + let a = i8x16::from_slice(simd, &a); + let b = i8x16::from_slice(simd, &b); + let expected_0: Vec = (1_i8..=16).collect(); + let expected_1: Vec = (2_i8..=17).collect(); + let expected_8: Vec = (9_i8..=24).collect(); + let expected_15: Vec = (16_i8..=31).collect(); + let expected_16: Vec = (17_i8..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<15>(b).as_slice(), &expected_15); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_i8x32(simd: S) { + let a: Vec = (1_i8..=32).collect(); + let b: Vec = (33_i8..=64).collect(); + let a = i8x32::from_slice(simd, &a); + let b = i8x32::from_slice(simd, &b); + let expected_0: Vec = (1_i8..=32).collect(); + let expected_1: Vec = (2_i8..=33).collect(); + let expected_16: Vec = (17_i8..=48).collect(); + let expected_31: Vec = (32_i8..=63).collect(); + let expected_32: Vec = (33_i8..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<31>(b).as_slice(), &expected_31); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_i8x64(simd: S) { + let a: Vec = (0_i8..=63).collect(); + let b: Vec = (64_i8..=127).collect(); + let a = i8x64::from_slice(simd, &a); + let b = i8x64::from_slice(simd, &b); + let expected_0: Vec = (0_i8..=63).collect(); + let expected_1: Vec = (1_i8..=64).collect(); + let expected_32: Vec = (32_i8..=95).collect(); + let expected_63: Vec = (63_i8..=126).collect(); + let expected_64: Vec = (64_i8..=127).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); + assert_eq!(a.slide::<63>(b).as_slice(), &expected_63); + assert_eq!(a.slide::<64>(b).as_slice(), &expected_64); +} + +#[simd_test] +fn slide_u8x16(simd: S) { + let a: Vec = (1..=16).collect(); + let b: Vec = (17..=32).collect(); + let a = u8x16::from_slice(simd, &a); + let b = u8x16::from_slice(simd, &b); + let expected_0: Vec = (1..=16).collect(); + let expected_8: Vec = (9..=24).collect(); + let expected_16: Vec = (17..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_u8x32(simd: S) { + let a: Vec = (1..=32).collect(); + let b: Vec = (33..=64).collect(); + let a = u8x32::from_slice(simd, &a); + let b = u8x32::from_slice(simd, &b); + let expected_0: Vec = (1..=32).collect(); + let expected_16: Vec = (17..=48).collect(); + let expected_32: Vec = 
(33..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_u8x64(simd: S) { + let a: Vec = (1..=64).collect(); + let b: Vec = (65..=128).collect(); + let a = u8x64::from_slice(simd, &a); + let b = u8x64::from_slice(simd, &b); + let expected_0: Vec = (1..=64).collect(); + let expected_32: Vec = (33..=96).collect(); + let expected_64: Vec = (65..=128).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); + assert_eq!(a.slide::<64>(b).as_slice(), &expected_64); +} + +#[simd_test] +fn slide_i16x8(simd: S) { + let a: Vec = (1_i16..=8).collect(); + let b: Vec = (9_i16..=16).collect(); + let a = i16x8::from_slice(simd, &a); + let b = i16x8::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=8).collect(); + let expected_1: Vec = (2_i16..=9).collect(); + let expected_4: Vec = (5_i16..=12).collect(); + let expected_7: Vec = (8_i16..=15).collect(); + let expected_8: Vec = (9_i16..=16).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<1>(b).as_slice(), &expected_1); + assert_eq!(a.slide::<4>(b).as_slice(), &expected_4); + assert_eq!(a.slide::<7>(b).as_slice(), &expected_7); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); +} + +#[simd_test] +fn slide_i16x16(simd: S) { + let a: Vec = (1_i16..=16).collect(); + let b: Vec = (17_i16..=32).collect(); + let a = i16x16::from_slice(simd, &a); + let b = i16x16::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=16).collect(); + let expected_8: Vec = (9_i16..=24).collect(); + let expected_16: Vec = (17_i16..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_i16x32(simd: S) { + let a: Vec = (1_i16..=32).collect(); + let b: Vec = (33_i16..=64).collect(); + let a = i16x32::from_slice(simd, &a); + let b = i16x32::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=32).collect(); + let expected_16: Vec = (17_i16..=48).collect(); + let expected_32: Vec = (33_i16..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_u16x8(simd: S) { + let a: Vec = (1..=8).collect(); + let b: Vec = (9..=16).collect(); + let a = u16x8::from_slice(simd, &a); + let b = u16x8::from_slice(simd, &b); + let expected_0: Vec = (1..=8).collect(); + let expected_4: Vec = (5..=12).collect(); + let expected_8: Vec = (9..=16).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<4>(b).as_slice(), &expected_4); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); +} + +#[simd_test] +fn slide_u16x16(simd: S) { + let a: Vec = (1..=16).collect(); + let b: Vec = (17..=32).collect(); + let a = u16x16::from_slice(simd, &a); + let b = u16x16::from_slice(simd, &b); + let expected_0: Vec = (1..=16).collect(); + let expected_8: Vec = (9..=24).collect(); + let expected_16: Vec = (17..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_u16x32(simd: S) { + let a: Vec = (1..=32).collect(); + let b: Vec = (33..=64).collect(); 
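+    // Per the `slide` semantics, `a.slide::<K>(b)` takes lanes `K..K + 32` of the 64-lane
+    // concatenation `[a, b]`, so `slide::<16>` below is expected to produce the values 17..=48.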
+ let a = u16x32::from_slice(simd, &a); + let b = u16x32::from_slice(simd, &b); + let expected_0: Vec = (1..=32).collect(); + let expected_16: Vec = (17..=48).collect(); + let expected_32: Vec = (33..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_i32x4(simd: S) { + let a = i32x4::from_slice(simd, &[1, 2, 3, 4]); + let b = i32x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5]); + assert_eq!(*a.slide::<2>(b), [3, 4, 5, 6]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_i32x8(simd: S) { + let a = i32x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = i32x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*a.slide::<1>(b), [2, 3, 4, 5, 6, 7, 8, 9]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8, 9, 10, 11, 12]); + assert_eq!(*a.slide::<7>(b), [8, 9, 10, 11, 12, 13, 14, 15]); + assert_eq!(*a.slide::<8>(b), [9, 10, 11, 12, 13, 14, 15, 16]); +} + +#[simd_test] +fn slide_i32x16(simd: S) { + let a: Vec = (1..=16).collect(); + let b: Vec = (17..=32).collect(); + let a = i32x16::from_slice(simd, &a); + let b = i32x16::from_slice(simd, &b); + let expected_0: Vec = (1..=16).collect(); + let expected_8: Vec = (9..=24).collect(); + let expected_16: Vec = (17..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_u32x4(simd: S) { + let a = u32x4::from_slice(simd, &[1, 2, 3, 4]); + let b = u32x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide::<2>(b), [3, 4, 5, 6]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_u32x8(simd: S) { + let a = u32x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = u32x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8, 9, 10, 11, 12]); + assert_eq!(*a.slide::<8>(b), [9, 10, 11, 12, 13, 14, 15, 16]); +} + +#[simd_test] +fn slide_u32x16(simd: S) { + let a: Vec = (1..=16).collect(); + let b: Vec = (17..=32).collect(); + let a = u32x16::from_slice(simd, &a); + let b = u32x16::from_slice(simd, &b); + let expected_0: Vec = (1..=16).collect(); + let expected_8: Vec = (9..=24).collect(); + let expected_16: Vec = (17..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_mask8x16(simd: S) { + let a: Vec = (1_i8..=16).collect(); + let b: Vec = (17_i8..=32).collect(); + let a = mask8x16::from_slice(simd, &a); + let b = mask8x16::from_slice(simd, &b); + let expected_0: Vec = (1_i8..=16).collect(); + let expected_8: Vec = (9_i8..=24).collect(); + let expected_16: Vec = (17_i8..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_mask8x32(simd: S) { + let a: Vec = (1_i8..=32).collect(); + let b: Vec = (33_i8..=64).collect(); + let a = mask8x32::from_slice(simd, &a); + let b = 
mask8x32::from_slice(simd, &b); + let expected_0: Vec = (1_i8..=32).collect(); + let expected_16: Vec = (17_i8..=48).collect(); + let expected_32: Vec = (33_i8..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_mask8x64(simd: S) { + let a: Vec = (0_i8..=63).collect(); + let b: Vec = (64_i8..=127).collect(); + let a = mask8x64::from_slice(simd, &a); + let b = mask8x64::from_slice(simd, &b); + let expected_0: Vec = (0_i8..=63).collect(); + let expected_32: Vec = (32_i8..=95).collect(); + let expected_64: Vec = (64_i8..=127).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); + assert_eq!(a.slide::<64>(b).as_slice(), &expected_64); +} + +#[simd_test] +fn slide_mask16x8(simd: S) { + let a: Vec = (1_i16..=8).collect(); + let b: Vec = (9_i16..=16).collect(); + let a = mask16x8::from_slice(simd, &a); + let b = mask16x8::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=8).collect(); + let expected_4: Vec = (5_i16..=12).collect(); + let expected_8: Vec = (9_i16..=16).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<4>(b).as_slice(), &expected_4); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); +} + +#[simd_test] +fn slide_mask16x16(simd: S) { + let a: Vec = (1_i16..=16).collect(); + let b: Vec = (17_i16..=32).collect(); + let a = mask16x16::from_slice(simd, &a); + let b = mask16x16::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=16).collect(); + let expected_8: Vec = (9_i16..=24).collect(); + let expected_16: Vec = (17_i16..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_mask16x32(simd: S) { + let a: Vec = (1_i16..=32).collect(); + let b: Vec = (33_i16..=64).collect(); + let a = mask16x32::from_slice(simd, &a); + let b = mask16x32::from_slice(simd, &b); + let expected_0: Vec = (1_i16..=32).collect(); + let expected_16: Vec = (17_i16..=48).collect(); + let expected_32: Vec = (33_i16..=64).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); + assert_eq!(a.slide::<32>(b).as_slice(), &expected_32); +} + +#[simd_test] +fn slide_mask32x4(simd: S) { + let a = mask32x4::from_slice(simd, &[1, 2, 3, 4]); + let b = mask32x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide::<2>(b), [3, 4, 5, 6]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_mask32x8(simd: S) { + let a = mask32x8::from_slice(simd, &[1, 2, 3, 4, 5, 6, 7, 8]); + let b = mask32x8::from_slice(simd, &[9, 10, 11, 12, 13, 14, 15, 16]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4, 5, 6, 7, 8]); + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8, 9, 10, 11, 12]); + assert_eq!(*a.slide::<8>(b), [9, 10, 11, 12, 13, 14, 15, 16]); +} + +#[simd_test] +fn slide_mask32x16(simd: S) { + let a: Vec = (1..=16).collect(); + let b: Vec = (17..=32).collect(); + let a = mask32x16::from_slice(simd, &a); + let b = mask32x16::from_slice(simd, &b); + let expected_0: Vec = (1..=16).collect(); + let expected_8: Vec = (9..=24).collect(); + let expected_16: Vec = (17..=32).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<8>(b).as_slice(), 
&expected_8); + assert_eq!(a.slide::<16>(b).as_slice(), &expected_16); +} + +#[simd_test] +fn slide_mask64x2(simd: S) { + let a = mask64x2::from_slice(simd, &[1, 2]); + let b = mask64x2::from_slice(simd, &[3, 4]); + assert_eq!(*a.slide::<0>(b), [1, 2]); + assert_eq!(*a.slide::<1>(b), [2, 3]); + assert_eq!(*a.slide::<2>(b), [3, 4]); +} + +#[simd_test] +fn slide_mask64x4(simd: S) { + let a = mask64x4::from_slice(simd, &[1, 2, 3, 4]); + let b = mask64x4::from_slice(simd, &[5, 6, 7, 8]); + assert_eq!(*a.slide::<0>(b), [1, 2, 3, 4]); + assert_eq!(*a.slide::<2>(b), [3, 4, 5, 6]); // crosses block + assert_eq!(*a.slide::<4>(b), [5, 6, 7, 8]); +} + +#[simd_test] +fn slide_mask64x8(simd: S) { + let a: Vec = (1..=8).collect(); + let b: Vec = (9..=16).collect(); + let a = mask64x8::from_slice(simd, &a); + let b = mask64x8::from_slice(simd, &b); + let expected_0: Vec = (1..=8).collect(); + let expected_4: Vec = (5..=12).collect(); + let expected_8: Vec = (9..=16).collect(); + assert_eq!(a.slide::<0>(b).as_slice(), &expected_0); + assert_eq!(a.slide::<4>(b).as_slice(), &expected_4); + assert_eq!(a.slide::<8>(b).as_slice(), &expected_8); +} + +// Because the slide amount is a const generic, the exhaustive tests have to *compile* one slide per amount per vector +// type. Disable them entirely.` +#[cfg(false)] +mod slide_exhaustive; diff --git a/fearless_simd_tests/tests/harness/slide_exhaustive.rs b/fearless_simd_tests/tests/harness/slide_exhaustive.rs new file mode 100644 index 00000000..1b82d454 --- /dev/null +++ b/fearless_simd_tests/tests/harness/slide_exhaustive.rs @@ -0,0 +1,292 @@ +// Copyright 2025 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Exhaustive tests for the "slide" operations. + +use fearless_simd::*; +use fearless_simd_dev_macros::simd_test; + +/// Helper macro for testing individual slide operations +macro_rules! test_vector_slide { + ($test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident, $shift:literal) => {{ + #[inline(never)] + fn do_test( + test_vec_a: $vec_ty, + test_vec_b: $vec_ty, + fallback_vec_a: $vec_ty, + fallback_vec_b: $vec_ty, + ) { + assert_eq!( + core::hint::black_box( + test_vec_a + .witness() + .vectorize(|| test_vec_a.slide::<$shift>(test_vec_b)) + .as_slice() + ), + core::hint::black_box(fallback_vec_a.slide::<$shift>(fallback_vec_b).as_slice()), + "slide::<{}> mismatch", + $shift + ); + } + + do_test($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b) + }}; +} + +macro_rules! test_block_slide { + ($test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident, $shift:literal) => {{ + // A bunch of weird stuff here is to prevent rustc/LLVM from inlining everything and generating enormous amounts + // of code. Since all these tests are run one after the other in macro-generated code, we want to provide some + // level of isolation between them to prevent the compiler from trying to optimize a huge function containing + // all of them. So, each test is run in its own `#[inline(never)]` helper function. The `black_box` is to + // prevent the compiler from unrolling the slice comparison into (up to 64!) individual inline comparisons, and + // forcing a memcmp instead. 
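+        // For reference, a scalar model of what `slide_within_blocks::<K>` computes. This is an
+        // illustrative sketch only; the helper name and parameters are invented for illustration
+        // and nothing calls it, and the assertion below compares the SIMD result against the
+        // `Fallback` implementation, not against this function. Within each 128-bit block, output
+        // lane `i` is lane `K + i` of that block of `a` while in range, and otherwise comes from
+        // the corresponding block of `b`.
+        #[allow(dead_code)]
+        fn scalar_slide_within_blocks<T: Copy>(
+            a: &[T],
+            b: &[T],
+            k: usize,
+            lanes_per_block: usize,
+            out: &mut [T],
+        ) {
+            for block_start in (0..a.len()).step_by(lanes_per_block) {
+                for i in 0..lanes_per_block {
+                    let src = k + i;
+                    out[block_start + i] = if src < lanes_per_block {
+                        a[block_start + src]
+                    } else {
+                        b[block_start + src - lanes_per_block]
+                    };
+                }
+            }
+        }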
+ #[inline(never)] + fn do_test( + test_vec_a: $vec_ty, + test_vec_b: $vec_ty, + fallback_vec_a: $vec_ty, + fallback_vec_b: $vec_ty, + ) { + assert_eq!( + core::hint::black_box( + test_vec_a + .witness() + .vectorize(|| test_vec_a.slide_within_blocks::<$shift>(test_vec_b)) + .as_slice() + ), + core::hint::black_box( + fallback_vec_a + .slide_within_blocks::<$shift>(fallback_vec_b) + .as_slice() + ), + "slide_within_blocks::<{}> mismatch", + $shift + ); + } + + do_test($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b) + }}; +} + +/// Macro to iterate over shift values for slide (0 to N) +/// For slide, valid shifts are 0..=N where N is the number of elements +macro_rules! for_each_slide { + // For 2-element vectors: shifts 0..=2 + (@n2 $callback:ident!($($args:tt)*)) => { + $callback!($($args)* 0); + $callback!($($args)* 1); + $callback!($($args)* 2); + }; + // For 4-element vectors: shifts 0..=4 + (@n4 $callback:ident!($($args:tt)*)) => { + for_each_slide!(@n2 $callback!($($args)*)); + $callback!($($args)* 3); + $callback!($($args)* 4); + }; + // For 8-element vectors: shifts 0..=8 + (@n8 $callback:ident!($($args:tt)*)) => { + for_each_slide!(@n4 $callback!($($args)*)); + $callback!($($args)* 5); + $callback!($($args)* 6); + $callback!($($args)* 7); + $callback!($($args)* 8); + }; + // For 16-element vectors: shifts 0..=16 + (@n16 $callback:ident!($($args:tt)*)) => { + for_each_slide!(@n8 $callback!($($args)*)); + $callback!($($args)* 9); + $callback!($($args)* 10); + $callback!($($args)* 11); + $callback!($($args)* 12); + $callback!($($args)* 13); + $callback!($($args)* 14); + $callback!($($args)* 15); + $callback!($($args)* 16); + }; + // For 32-element vectors: shifts 0..=32 + (@n32 $callback:ident!($($args:tt)*)) => { + for_each_slide!(@n16 $callback!($($args)*)); + $callback!($($args)* 17); + $callback!($($args)* 18); + $callback!($($args)* 19); + $callback!($($args)* 20); + $callback!($($args)* 21); + $callback!($($args)* 22); + $callback!($($args)* 23); + $callback!($($args)* 24); + $callback!($($args)* 25); + $callback!($($args)* 26); + $callback!($($args)* 27); + $callback!($($args)* 28); + $callback!($($args)* 29); + $callback!($($args)* 30); + $callback!($($args)* 31); + $callback!($($args)* 32); + }; + // For 64-element vectors: shifts 0..=64 + (@n64 $callback:ident!($($args:tt)*)) => { + for_each_slide!(@n32 $callback!($($args)*)); + $callback!($($args)* 33); + $callback!($($args)* 34); + $callback!($($args)* 35); + $callback!($($args)* 36); + $callback!($($args)* 37); + $callback!($($args)* 38); + $callback!($($args)* 39); + $callback!($($args)* 40); + $callback!($($args)* 41); + $callback!($($args)* 42); + $callback!($($args)* 43); + $callback!($($args)* 44); + $callback!($($args)* 45); + $callback!($($args)* 46); + $callback!($($args)* 47); + $callback!($($args)* 48); + $callback!($($args)* 49); + $callback!($($args)* 50); + $callback!($($args)* 51); + $callback!($($args)* 52); + $callback!($($args)* 53); + $callback!($($args)* 54); + $callback!($($args)* 55); + $callback!($($args)* 56); + $callback!($($args)* 57); + $callback!($($args)* 58); + $callback!($($args)* 59); + $callback!($($args)* 60); + $callback!($($args)* 61); + $callback!($($args)* 62); + $callback!($($args)* 63); + $callback!($($args)* 64); + }; +} + +/// Main macro for testing slide operations +macro_rules! 
test_slide_impl { + // Vector-wide operations + (@vec2 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n2 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@vec4 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n4 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@vec8 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n8 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@vec16 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n16 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@vec32 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n32 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@vec64 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n64 test_vector_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + + // Within-block operations + (@block2 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n2 test_block_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@block4 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n4 test_block_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@block8 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n8 test_block_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; + (@block16 $test_vec_a:expr, $test_vec_b:expr, $fallback_vec_a:expr, $fallback_vec_b:expr, $vec_ty:ident) => { + for_each_slide!(@n16 test_block_slide!($test_vec_a, $test_vec_b, $fallback_vec_a, $fallback_vec_b, $vec_ty,)); + }; +} + +/// Generate a test function for slide exhaustive testing +macro_rules! 
test_slide_exhaustive { + ($test_name:ident, $vec_ty:ident, $elem_ty:ty, $n_elems:literal, $vec_n:ident, $block_n:ident) => { + #[simd_test] + fn $test_name(simd: S) { + let fallback = fearless_simd::Fallback::new(); + + let vals_a: [$elem_ty; $n_elems] = core::hint::black_box(core::array::from_fn(|i| (i + 1) as $elem_ty)); + let vals_b: [$elem_ty; $n_elems] = core::hint::black_box(core::array::from_fn(|i| (i + 1 + $n_elems) as $elem_ty)); + + let test_vec_a = $vec_ty::from_slice(simd, &vals_a); + let test_vec_b = $vec_ty::from_slice(simd, &vals_b); + let fallback_vec_a = <$vec_ty::>::from_slice(fallback, &vals_a); + let fallback_vec_b = <$vec_ty::>::from_slice(fallback, &vals_b); + + // Test vector-wide operations + test_slide_impl!(@$vec_n test_vec_a, test_vec_b, fallback_vec_a, fallback_vec_b, $vec_ty); + // Test within-block operations + test_slide_impl!(@$block_n test_vec_a, test_vec_b, fallback_vec_a, fallback_vec_b, $vec_ty); + } + }; +} + +// 128-bit vectors (block size == vector size, so within_blocks uses same range as vector-wide) +test_slide_exhaustive!(slide_exhaustive_f32x4, f32x4, f32, 4, vec4, block4); +test_slide_exhaustive!(slide_exhaustive_f64x2, f64x2, f64, 2, vec2, block2); +test_slide_exhaustive!(slide_exhaustive_i8x16, i8x16, i8, 16, vec16, block16); +test_slide_exhaustive!(slide_exhaustive_u8x16, u8x16, u8, 16, vec16, block16); +test_slide_exhaustive!(slide_exhaustive_i16x8, i16x8, i16, 8, vec8, block8); +test_slide_exhaustive!(slide_exhaustive_u16x8, u16x8, u16, 8, vec8, block8); +test_slide_exhaustive!(slide_exhaustive_i32x4, i32x4, i32, 4, vec4, block4); +test_slide_exhaustive!(slide_exhaustive_u32x4, u32x4, u32, 4, vec4, block4); + +// 256-bit vectors (block size = 128 bits = half the vector size) +test_slide_exhaustive!(slide_exhaustive_f32x8, f32x8, f32, 8, vec8, block4); +test_slide_exhaustive!(slide_exhaustive_f64x4, f64x4, f64, 4, vec4, block2); +test_slide_exhaustive!(slide_exhaustive_i8x32, i8x32, i8, 32, vec32, block16); +test_slide_exhaustive!(slide_exhaustive_u8x32, u8x32, u8, 32, vec32, block16); +test_slide_exhaustive!(slide_exhaustive_i16x16, i16x16, i16, 16, vec16, block8); +test_slide_exhaustive!(slide_exhaustive_u16x16, u16x16, u16, 16, vec16, block8); +test_slide_exhaustive!(slide_exhaustive_i32x8, i32x8, i32, 8, vec8, block4); +test_slide_exhaustive!(slide_exhaustive_u32x8, u32x8, u32, 8, vec8, block4); + +// 512-bit vectors (block size = 128 bits = quarter the vector size) +test_slide_exhaustive!(slide_exhaustive_f32x16, f32x16, f32, 16, vec16, block4); +test_slide_exhaustive!(slide_exhaustive_f64x8, f64x8, f64, 8, vec8, block2); +test_slide_exhaustive!(slide_exhaustive_i8x64, i8x64, i8, 64, vec64, block16); +test_slide_exhaustive!(slide_exhaustive_u8x64, u8x64, u8, 64, vec64, block16); +test_slide_exhaustive!(slide_exhaustive_i16x32, i16x32, i16, 32, vec32, block8); +test_slide_exhaustive!(slide_exhaustive_u16x32, u16x32, u16, 32, vec32, block8); +test_slide_exhaustive!(slide_exhaustive_i32x16, i32x16, i32, 16, vec16, block4); +test_slide_exhaustive!(slide_exhaustive_u32x16, u32x16, u32, 16, vec16, block4); + +// Mask types (128-bit) +test_slide_exhaustive!(slide_exhaustive_mask8x16, mask8x16, i8, 16, vec16, block16); +test_slide_exhaustive!(slide_exhaustive_mask16x8, mask16x8, i16, 8, vec8, block8); +test_slide_exhaustive!(slide_exhaustive_mask32x4, mask32x4, i32, 4, vec4, block4); +test_slide_exhaustive!(slide_exhaustive_mask64x2, mask64x2, i64, 2, vec2, block2); + +// Mask types (256-bit) +test_slide_exhaustive!(slide_exhaustive_mask8x32, 
mask8x32, i8, 32, vec32, block16); +test_slide_exhaustive!( + slide_exhaustive_mask16x16, + mask16x16, + i16, + 16, + vec16, + block8 +); +test_slide_exhaustive!(slide_exhaustive_mask32x8, mask32x8, i32, 8, vec8, block4); +test_slide_exhaustive!(slide_exhaustive_mask64x4, mask64x4, i64, 4, vec4, block2); + +// Mask types (512-bit) +test_slide_exhaustive!(slide_exhaustive_mask8x64, mask8x64, i8, 64, vec64, block16); +test_slide_exhaustive!( + slide_exhaustive_mask16x32, + mask16x32, + i16, + 32, + vec32, + block8 +); +test_slide_exhaustive!( + slide_exhaustive_mask32x16, + mask32x16, + i32, + 16, + vec16, + block4 +); +test_slide_exhaustive!(slide_exhaustive_mask64x8, mask64x8, i64, 8, vec8, block2); From f76e1331b97ba4757338241a18653fcc9f7a6204 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Fri, 12 Dec 2025 21:09:49 -0500 Subject: [PATCH 2/4] Autogenerate signatures for splat and slide --- fearless_simd/src/generated/simd_trait.rs | 169 ++++++++++------------ fearless_simd_gen/src/mk_simd_trait.rs | 37 ++--- fearless_simd_gen/src/ops.rs | 36 ++++- 3 files changed, 118 insertions(+), 124 deletions(-) diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index bad2db0e..09e65d41 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -136,9 +136,9 @@ pub trait Simd: fn cvt_from_bytes_f32x4(self, a: u8x16) -> f32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x4(self, a: f32x4) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f32x4( self, a: f32x4, @@ -236,9 +236,9 @@ pub trait Simd: fn cvt_from_bytes_i8x16(self, a: u8x16) -> i8x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x16(self, a: i8x16) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i8x16( self, a: i8x16, @@ -314,9 +314,9 @@ pub trait Simd: fn cvt_from_bytes_u8x16(self, a: u8x16) -> u8x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x16(self, a: u8x16) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u8x16( self, a: u8x16, @@ -390,13 +390,13 @@ pub trait Simd: fn cvt_from_bytes_mask8x16(self, a: u8x16) -> mask8x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x16(self, a: mask8x16) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask8x16( self, a: mask8x16, b: mask8x16, ) -> mask8x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask8x16( self, a: mask8x16, @@ -445,9 +445,9 @@ pub trait Simd: fn cvt_from_bytes_i16x8(self, a: u8x16) -> i16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x8(self, a: i16x8) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i16x8( self, a: i16x8, @@ -523,9 +523,9 @@ pub trait Simd: fn cvt_from_bytes_u16x8(self, a: u8x16) -> u16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x8(self, a: u16x8) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u16x8( self, a: u16x8, @@ -599,13 +599,13 @@ pub trait Simd: fn cvt_from_bytes_mask16x8(self, a: u8x16) -> mask16x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x8(self, a: mask16x8) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask16x8( self, a: mask16x8, b: mask16x8, ) -> mask16x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask16x8( self, a: mask16x8, @@ -654,9 +654,9 @@ pub trait Simd: fn cvt_from_bytes_i32x4(self, a: u8x16) -> i32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x4(self, a: i32x4) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i32x4( self, a: i32x4, @@ -734,9 +734,9 @@ pub trait Simd: fn cvt_from_bytes_u32x4(self, a: u8x16) -> u32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x4(self, a: u32x4) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u32x4( self, a: u32x4, @@ -810,13 +810,13 @@ pub trait Simd: fn cvt_from_bytes_mask32x4(self, a: u8x16) -> mask32x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask32x4(self, a: mask32x4) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask32x4( self, a: mask32x4, b: mask32x4, ) -> mask32x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask32x4( self, a: mask32x4, @@ -865,9 +865,9 @@ pub trait Simd: fn cvt_from_bytes_f64x2(self, a: u8x16) -> f64x2; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x2(self, a: f64x2) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f64x2( self, a: f64x2, @@ -951,13 +951,13 @@ pub trait Simd: fn cvt_from_bytes_mask64x2(self, a: u8x16) -> mask64x2; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x2(self, a: mask64x2) -> u8x16; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask64x2( self, a: mask64x2, b: mask64x2, ) -> mask64x2; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask64x2( self, a: mask64x2, @@ -1006,9 +1006,9 @@ pub trait Simd: fn cvt_from_bytes_f32x8(self, a: u8x32) -> f32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x8(self, a: f32x8) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f32x8( self, a: f32x8, @@ -1108,9 +1108,9 @@ pub trait Simd: fn cvt_from_bytes_i8x32(self, a: u8x32) -> i8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x32(self, a: i8x32) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i8x32( self, a: i8x32, @@ -1188,9 +1188,9 @@ pub trait Simd: fn cvt_from_bytes_u8x32(self, a: u8x32) -> u8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x32(self, a: u8x32) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u8x32( self, a: u8x32, @@ -1266,13 +1266,13 @@ pub trait Simd: fn cvt_from_bytes_mask8x32(self, a: u8x32) -> mask8x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x32(self, a: mask8x32) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask8x32( self, a: mask8x32, b: mask8x32, ) -> mask8x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask8x32( self, a: mask8x32, @@ -1323,9 +1323,9 @@ pub trait Simd: fn cvt_from_bytes_i16x16(self, a: u8x32) -> i16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x16(self, a: i16x16) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i16x16( self, a: i16x16, @@ -1403,9 +1403,9 @@ pub trait Simd: fn cvt_from_bytes_u16x16(self, a: u8x32) -> u16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x16(self, a: u16x16) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u16x16( self, a: u16x16, @@ -1483,13 +1483,13 @@ pub trait Simd: fn cvt_from_bytes_mask16x16(self, a: u8x32) -> mask16x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x16(self, a: mask16x16) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask16x16( self, a: mask16x16, b: mask16x16, ) -> mask16x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask16x16( self, a: mask16x16, @@ -1540,9 +1540,9 @@ pub trait Simd: fn cvt_from_bytes_i32x8(self, a: u8x32) -> i32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x8(self, a: i32x8) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i32x8( self, a: i32x8, @@ -1622,9 +1622,9 @@ pub trait Simd: fn cvt_from_bytes_u32x8(self, a: u8x32) -> u32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x8(self, a: u32x8) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u32x8( self, a: u32x8, @@ -1700,13 +1700,13 @@ pub trait Simd: fn cvt_from_bytes_mask32x8(self, a: u8x32) -> mask32x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask32x8(self, a: mask32x8) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask32x8( self, a: mask32x8, b: mask32x8, ) -> mask32x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask32x8( self, a: mask32x8, @@ -1757,9 +1757,9 @@ pub trait Simd: fn cvt_from_bytes_f64x4(self, a: u8x32) -> f64x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x4(self, a: f64x4) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f64x4( self, a: f64x4, @@ -1845,13 +1845,13 @@ pub trait Simd: fn cvt_from_bytes_mask64x4(self, a: u8x32) -> mask64x4; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x4(self, a: mask64x4) -> u8x32; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask64x4( self, a: mask64x4, b: mask64x4, ) -> mask64x4; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask64x4( self, a: mask64x4, @@ -1902,9 +1902,9 @@ pub trait Simd: fn cvt_from_bytes_f32x16(self, a: u8x64) -> f32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f32x16(self, a: f32x16) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f32x16( self, a: f32x16, @@ -2006,9 +2006,9 @@ pub trait Simd: fn cvt_from_bytes_i8x64(self, a: u8x64) -> i8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i8x64(self, a: i8x64) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i8x64( self, a: i8x64, @@ -2084,9 +2084,9 @@ pub trait Simd: fn cvt_from_bytes_u8x64(self, a: u8x64) -> u8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u8x64(self, a: u8x64) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u8x64( self, a: u8x64, @@ -2162,13 +2162,13 @@ pub trait Simd: fn cvt_from_bytes_mask8x64(self, a: u8x64) -> mask8x64; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask8x64(self, a: mask8x64) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask8x64( self, a: mask8x64, b: mask8x64, ) -> mask8x64; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask8x64( self, a: mask8x64, @@ -2217,9 +2217,9 @@ pub trait Simd: fn cvt_from_bytes_i16x32(self, a: u8x64) -> i16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i16x32(self, a: i16x32) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i16x32( self, a: i16x32, @@ -2295,9 +2295,9 @@ pub trait Simd: fn cvt_from_bytes_u16x32(self, a: u8x64) -> u16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u16x32(self, a: u16x32) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u16x32( self, a: u16x32, @@ -2377,13 +2377,13 @@ pub trait Simd: fn cvt_from_bytes_mask16x32(self, a: u8x64) -> mask16x32; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask16x32(self, a: mask16x32) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask16x32( self, a: mask16x32, b: mask16x32, ) -> mask16x32; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask16x32( self, a: mask16x32, @@ -2432,9 +2432,9 @@ pub trait Simd: fn cvt_from_bytes_i32x16(self, a: u8x64) -> i32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_i32x16(self, a: i32x16) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_i32x16( self, a: i32x16, @@ -2512,9 +2512,9 @@ pub trait Simd: fn cvt_from_bytes_u32x16(self, a: u8x64) -> u32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_u32x16(self, a: u32x16) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_u32x16( self, a: u32x16, @@ -2592,13 +2592,13 @@ pub trait Simd: fn cvt_from_bytes_mask32x16(self, a: u8x64) -> mask32x16; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask32x16(self, a: mask32x16) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask32x16( self, a: mask32x16, b: mask32x16, ) -> mask32x16; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask32x16( self, a: mask32x16, @@ -2647,9 +2647,9 @@ pub trait Simd: fn cvt_from_bytes_f64x8(self, a: u8x64) -> f64x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_f64x8(self, a: f64x8) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_f64x8( self, a: f64x8, @@ -2733,13 +2733,13 @@ pub trait Simd: fn cvt_from_bytes_mask64x8(self, a: u8x64) -> mask64x8; #[doc = "Reinterpret a SIMD vector as a vector of bytes, with the equivalent byte length."] fn cvt_to_bytes_mask64x8(self, a: mask64x8) -> u8x64; - #[doc = ""] + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide_mask64x8( self, a: mask64x8, b: mask64x8, ) -> mask64x8; - #[doc = ""] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks_mask64x8( self, a: mask64x8, @@ -2859,8 +2859,6 @@ pub trait SimdBase: #[doc = r""] #[doc = r" The slice must be the proper width."] fn from_slice(simd: S, slice: &[Self::Element]) -> Self; - #[doc = r" Create a SIMD vector with all elements set to the given value."] - fn splat(simd: S, val: Self::Element) -> Self; #[doc = r" Create a SIMD vector from a 128-bit vector of the same scalar"] #[doc = r" type, repeated."] fn block_splat(block: Self::Block) -> Self; @@ -2868,26 +2866,11 @@ pub trait SimdBase: #[doc = r" calling `f` with that element's lane index (from 0 to"] #[doc = r" [`SimdBase::N`] - 1)."] fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self; - #[doc = r" Concatenate `[self, rhs]` and extract `Self::N` elements"] - #[doc = r" starting at index `SHIFT`."] - #[doc = r""] - #[doc = r" `SHIFT` must be within [0, `Self::N`]."] - #[doc = r""] - #[doc = r#" This can be used to implement a "shift items" operation by"#] - #[doc = r" providing all zeroes as one operand. For a left shift, the"] - #[doc = r" right-hand side should be all zeroes. For a right shift by `M`"] - #[doc = r" items, the left-hand side should be all zeroes, and the shift"] - #[doc = r" amount will be `Self::N - M`."] - #[doc = r""] - #[doc = r" This can also be used to rotate items within a vector by"] - #[doc = r" providing the same vector as both operands."] - #[doc = r""] - #[doc = r" ```text"] - #[doc = r" slide::<1>([a b c d], [e f g h]) == [b c d e]"] - #[doc = r" ```"] + #[doc = "Create a SIMD vector with all elements set to the given value."] + fn splat(simd: S, val: Self::Element) -> Self; + #[doc = "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n`SHIFT` must be within [0, `Self::N`].\n\nThis can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. 
For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\nThis can also be used to rotate items within a vector by providing the same vector as both operands.\n\n```text\n\nslide::<1>([a b c d], [e f g h]) == [b c d e]\n\n```"] fn slide(self, rhs: impl SimdInto) -> Self; - #[doc = r" Like [`slide`](SimdBase::slide), but operates independently on"] - #[doc = r" each 128-bit block."] + #[doc = "Like `slide`, but operates independently on each 128-bit block."] fn slide_within_blocks(self, rhs: impl SimdInto) -> Self; } #[doc = r" Functionality implemented by floating-point SIMD vectors."] diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs index b62a5ffb..8c235b52 100644 --- a/fearless_simd_gen/src/mk_simd_trait.rs +++ b/fearless_simd_gen/src/mk_simd_trait.rs @@ -5,7 +5,7 @@ use proc_macro2::TokenStream; use quote::quote; use crate::{ - ops::{OpKind, TyFlavor, ops_for_type, overloaded_ops_for, vec_trait_ops_for}, + ops::{OpKind, TyFlavor, base_trait_ops, ops_for_type, overloaded_ops_for, vec_trait_ops_for}, types::{SIMD_TYPES, ScalarType, type_imports}, }; @@ -130,6 +130,17 @@ pub(crate) fn mk_arch_types() -> TokenStream { } fn mk_simd_base() -> TokenStream { + let mut methods = vec![]; + for op in base_trait_ops() { + let doc = op.format_docstring(TyFlavor::VecImpl); + if let Some(method_sig) = op.vec_trait_method_sig() { + methods.push(quote! { + #[doc = #doc] + #method_sig; + }); + } + } + quote! { /// Base functionality implemented by all SIMD vectors. pub trait SimdBase: @@ -167,8 +178,6 @@ fn mk_simd_base() -> TokenStream { /// /// The slice must be the proper width. fn from_slice(simd: S, slice: &[Self::Element]) -> Self; - /// Create a SIMD vector with all elements set to the given value. - fn splat(simd: S, val: Self::Element) -> Self; /// Create a SIMD vector from a 128-bit vector of the same scalar /// type, repeated. fn block_splat(block: Self::Block) -> Self; @@ -177,27 +186,7 @@ fn mk_simd_base() -> TokenStream { /// [`SimdBase::N`] - 1). fn from_fn(simd: S, f: impl FnMut(usize) -> Self::Element) -> Self; - /// Concatenate `[self, rhs]` and extract `Self::N` elements - /// starting at index `SHIFT`. - /// - /// `SHIFT` must be within [0, `Self::N`]. - /// - /// This can be used to implement a "shift items" operation by - /// providing all zeroes as one operand. For a left shift, the - /// right-hand side should be all zeroes. For a right shift by `M` - /// items, the left-hand side should be all zeroes, and the shift - /// amount will be `Self::N - M`. - /// - /// This can also be used to rotate items within a vector by - /// providing the same vector as both operands. - /// - /// ```text - /// slide::<1>([a b c d], [e f g h]) == [b c d e] - /// ``` - fn slide(self, rhs: impl SimdInto) -> Self; - /// Like [`slide`](SimdBase::slide), but operates independently on - /// each 128-bit block. - fn slide_within_blocks(self, rhs: impl SimdInto) -> Self; + #( #methods )* } } } diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index e637ef7c..cf1f7e9a 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -310,7 +310,12 @@ impl Op { .collect::>(); let method_ident = Ident::new(self.method, Span::call_site()); let sig_inner = match &self.sig { - OpSig::Splat | OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } => { + OpSig::Splat => { + let arg0 = &arg_names[0]; + let arg1 = &arg_names[1]; + quote! 
{ (#arg0: S, #arg1: Self::Element) -> Self } + } + OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } => { return None; } OpSig::Unary @@ -470,7 +475,13 @@ const BASE_OPS: &[Op] = &[ OpSig::Slide { granularity: SlideGranularity::AcrossBlocks, }, - "", + "Concatenate `[self, rhs]` and extract `Self::N` elements starting at index `SHIFT`.\n\n\ + `SHIFT` must be within [0, `Self::N`].\n\n\ + This can be used to implement a \"shift items\" operation by providing all zeroes as one operand. For a left shift, the right-hand side should be all zeroes. For a right shift by `M` items, the left-hand side should be all zeroes, and the shift amount will be `Self::N - M`.\n\n\ + This can also be used to rotate items within a vector by providing the same vector as both operands.\n\n\ + ```text\n\n\ + slide::<1>([a b c d], [e f g h]) == [b c d e]\n\n\ + ```", ), Op::new( "slide_within_blocks", @@ -478,7 +489,7 @@ const BASE_OPS: &[Op] = &[ OpSig::Slide { granularity: SlideGranularity::WithinBlocks, }, - "", + "Like `slide`, but operates independently on each 128-bit block.", ), ]; @@ -944,6 +955,14 @@ const MASK_OPS: &[Op] = &[ ), ]; +pub(crate) fn base_trait_ops() -> Vec { + BASE_OPS + .iter() + .filter(|op| matches!(op.kind, OpKind::BaseTraitMethod)) + .copied() + .collect() +} + pub(crate) fn vec_trait_ops_for(scalar: ScalarType) -> Vec { let base = match scalar { ScalarType::Float => FLOAT_OPS, @@ -1367,8 +1386,8 @@ impl OpSig { } fn vec_trait_arg_names(&self) -> &'static [&'static str] { match self { - Self::Splat - | Self::LoadInterleaved { .. } + Self::Splat => &["simd", "val"], + Self::LoadInterleaved { .. } | Self::StoreInterleaved { .. } | Self::FromArray { .. } | Self::FromBytes { .. } => &[], @@ -1397,6 +1416,10 @@ impl OpSig { .map(|n| Ident::new(n, Span::call_site())) .collect::>(); let args = match self { + Self::Splat => { + let arg1 = &arg_names[1]; + quote! { #arg1 } + } Self::Unary | Self::MaskReduce { .. } => { let arg0 = &arg_names[0]; quote! { #arg0 } @@ -1416,8 +1439,7 @@ impl OpSig { let arg2 = &arg_names[2]; quote! { #arg0, #arg1.simd_into(self.simd), #arg2.simd_into(self.simd) } } - Self::Splat - | Self::Select + Self::Select | Self::Split { .. } | Self::Cvt { .. } | Self::Reinterpret { .. } From 52adf73f111e03991b10a4ddcdc880a8b0078d10 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Sat, 20 Dec 2025 19:47:17 -0500 Subject: [PATCH 3/4] Comment x86 dyn_alignr_helpers --- fearless_simd_gen/src/mk_x86.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index b89a02d3..e348f304 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -1563,6 +1563,10 @@ impl X86 { } } + /// Generates versions of the "alignr" intrinsics that take the shift amount as a regular argument instead of a + /// const generic argument, to make them easier to use in higher-level operations. These are low-level helpers that + /// inherit the semantics of the underlying `alignr` intrinsics, so the argument order is backwards from ARM's + /// `vext` and our `slide` operation, and the 256-bit AVX2 version still operates *within* 128-bit lanes. 
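+    ///
+    /// As a rough illustration of the correspondence (using the 128-bit helper on byte
+    /// vectors): `dyn_alignr_128(b, a, n)` yields the same bytes as `slide::<n>(a, b)`,
+    /// namely `a[n..16]` followed by `b[0..n]`; the "high" operand `b` is passed first.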
fn dyn_alignr_helpers(&self) -> TokenStream { let mut fns = vec![]; From 99d542d93f536cb40332ce152132dd010fcfe906 Mon Sep 17 00:00:00 2001 From: valadaptive Date: Sat, 20 Dec 2025 19:49:46 -0500 Subject: [PATCH 4/4] Fix Clippy warnings --- fearless_simd/src/generated/avx2.rs | 4 ++-- fearless_simd_gen/src/mk_x86.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 6503426d..fbd2f201 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -8417,13 +8417,13 @@ unsafe fn cross_block_alignr_one( ) -> __m256i { let lo_idx = block_idx + (shift_bytes / 16); let intra_shift = shift_bytes % 16; - let lo_blocks = if lo_idx % 2 == 0 { + let lo_blocks = if lo_idx & 1 == 0 { regs[lo_idx / 2] } else { unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } }; let hi_idx = lo_idx + 1; - let hi_blocks = if hi_idx % 2 == 0 { + let hi_blocks = if hi_idx & 1 == 0 { regs[hi_idx / 2] } else { unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index e348f304..bec30435 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -85,8 +85,8 @@ impl Level for X86 { fn make_module_footer(&self) -> TokenStream { let alignr_helpers = self.dyn_alignr_helpers(); let slide_helpers = match self { - X86::Sse4_2 => Self::sse42_slide_helpers(), - X86::Avx2 => Self::avx2_slide_helpers(), + Self::Sse4_2 => Self::sse42_slide_helpers(), + Self::Avx2 => Self::avx2_slide_helpers(), }; quote! { @@ -1571,8 +1571,8 @@ impl X86 { let mut fns = vec![]; let vec_widths: &[usize] = match self { - X86::Sse4_2 => &[128], - X86::Avx2 => &[128, 256], + Self::Sse4_2 => &[128], + Self::Avx2 => &[128, 256], }; for vec_ty in vec_widths @@ -1644,7 +1644,7 @@ impl X86 { unsafe fn cross_block_alignr_one(regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i { let lo_idx = block_idx + (shift_bytes / 16); let intra_shift = shift_bytes % 16; - let lo_blocks = if lo_idx % 2 == 0 { + let lo_blocks = if lo_idx & 1 == 0 { regs[lo_idx / 2] } else { unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } @@ -1652,7 +1652,7 @@ impl X86 { // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`) let hi_idx = lo_idx + 1; - let hi_blocks = if hi_idx % 2 == 0 { + let hi_blocks = if hi_idx & 1 == 0 { regs[hi_idx / 2] } else { unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }