linebender · dbuch · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025 · Sep 14, 2025
diff --git a/fearless_simd/src/traits.rs b/fearless_simd/src/traits.rs
@@ -119,6 +119,10 @@ impl SimdElement for i64 {
     type Mask = i64;
 }
 
+impl SimdElement for u64 {
+    type Mask = i64;
+}
+
 /// Construction of integer vectors from floats by truncation
 pub trait SimdCvtTruncate<T> {
     fn truncate_from(x: T) -> Self;

diff --git a/fearless_simd_gen/src/mk_avx2.rs b/fearless_simd_gen/src/mk_avx2.rs
@@ -112,9 +112,12 @@ fn mk_simd_impl() -> TokenStream {
             type i16s = i16x8<Self>;
             type u32s = u32x4<Self>;
             type i32s = i32x4<Self>;
+            type u64s = u64x2<Self>;
+            type i64s = i64x2<Self>;
             type mask8s = mask8x16<Self>;
             type mask16s = mask16x8<Self>;
             type mask32s = mask32x4<Self>;
+            type mask64s = mask64x2<Self>;
             #[inline(always)]
             fn level(self) -> Level {
                 Level::#level_tok(self)

diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
@@ -402,9 +402,12 @@ fn mk_simd_impl() -> TokenStream {
             type i16s = i16x8<Self>;
             type u32s = u32x4<Self>;
             type i32s = i32x4<Self>;
+            type u64s = u64x2<Self>;
+            type i64s = i64x2<Self>;
             type mask8s = mask8x16<Self>;
             type mask16s = mask16x8<Self>;
             type mask32s = mask32x4<Self>;
+            type mask64s = mask64x2<Self>;
             #[inline(always)]
             fn level(self) -> Level {
                 Level::#level_tok(self)

diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
@@ -409,9 +409,12 @@ fn mk_simd_impl(level: Level) -> TokenStream {
             type i16s = i16x8<Self>;
             type u32s = u32x4<Self>;
             type i32s = i32x4<Self>;
+            type u64s = u64x2<Self>;
+            type i64s = i64x2<Self>;
             type mask8s = mask8x16<Self>;
             type mask16s = mask16x8<Self>;
             type mask32s = mask32x4<Self>;
+            type mask64s = mask64x2<Self>;
             #[inline(always)]
             fn level(self) -> Level {
                 Level::#level_tok(self)

diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -44,10 +44,12 @@ pub fn mk_simd_trait() -> TokenStream {
             type u32s: SimdInt<u32, Self, Block = u32x4<Self>, Mask = Self::mask32s> + SimdCvtTruncate<Self::f32s>;
             type i32s: SimdInt<i32, Self, Block = i32x4<Self>, Mask = Self::mask32s, Bytes = <Self::u32s as Bytes>::Bytes> + SimdCvtTruncate<Self::f32s>
                 + core::ops::Neg<Output = Self::i32s>;
+            type u64s: SimdInt<u64, Self, Block = u64x2<Self>, Mask = Self::mask64s>; // + SimdCvtTruncate<Self::f64s>;
+            type i64s: SimdInt<i64, Self, Block = i64x2<Self>, Mask = Self::mask64s, Bytes = <Self::u64s as Bytes>::Bytes> + core::ops::Neg<Output = Self::i64s>; // + SimdCvtTruncate<Self::f64s>;
             type mask8s: SimdMask<i8, Self, Block = mask8x16<Self>, Bytes = <Self::u8s as Bytes>::Bytes> + Select<Self::u8s> + Select<Self::i8s> + Select<Self::mask8s>;
             type mask16s: SimdMask<i16, Self, Block = mask16x8<Self>, Bytes = <Self::u16s as Bytes>::Bytes> + Select<Self::u16s> + Select<Self::i16s> + Select<Self::mask16s>;
-            type mask32s: SimdMask<i32, Self, Block = mask32x4<Self>, Bytes = <Self::u32s as Bytes>::Bytes>
-                + Select<Self::f32s> + Select<Self::u32s> + Select<Self::i32s> + Select<Self::mask32s>;
+            type mask32s: SimdMask<i32, Self, Block = mask32x4<Self>, Bytes = <Self::u32s as Bytes>::Bytes> + Select<Self::f32s> + Select<Self::u32s> + Select<Self::i32s> + Select<Self::mask32s>;
+            type mask64s: SimdMask<i64, Self, Block = mask64x2<Self>, Bytes = <Self::u64s as Bytes>::Bytes> + Select<Self::u64s> + Select<Self::i64s> + Select<Self::mask64s>; // + Select<Self::f64s>
             fn level(self) -> Level;
 
             /// Call function with CPU features enabled.

diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs
@@ -110,9 +110,12 @@ fn mk_simd_impl() -> TokenStream {
             type i16s = i16x8<Self>;
             type u32s = u32x4<Self>;
             type i32s = i32x4<Self>;
+            type u64s = u64x2<Self>;
+            type i64s = i64x2<Self>;
             type mask8s = mask8x16<Self>;
             type mask16s = mask16x8<Self>;
             type mask32s = mask32x4<Self>;
+            type mask64s = mask64x2<Self>;
             #[inline(always)]
             fn level(self) -> Level {
                 Level::#level_tok(self)
@@ -258,13 +261,18 @@ pub(crate) fn handle_compare(
 
             let max_min_expr = arch.expr(max_min, vec_ty, &args);
             quote! { #eq_intrinsic(#max_min_expr, a.into()) }
+        } else if matches!(method, "simd_eq") && vec_ty.scalar_bits == 64 {
+            let eq =
+                simple_sign_unaware_intrinsic("cmpeq", vec_ty.scalar, vec_ty.scalar_bits, ty_bits);
+            quote! { #eq(a.into(), b.into()) }
         } else if vec_ty.scalar == ScalarType::Unsigned {
             // SSE4.2 only has signed GT/LT, but not unsigned.
             let set = set1_intrinsic(vec_ty.scalar, vec_ty.scalar_bits, ty_bits);
             let sign = match vec_ty.scalar_bits {
                 8 => quote! { 0x80u8 },
                 16 => quote! { 0x8000u16 },
                 32 => quote! { 0x80000000u32 },
+                64 => quote! { 0x8000000000000000u64 },
                 _ => unimplemented!(),
             };
             let gt =
@@ -282,10 +290,29 @@ pub(crate) fn handle_compare(
 
                 #gt(#args)
             }
+        } else if vec_ty.scalar_bits == 64 {
+            let intrinsic_name = if matches!(method, "simd_eq") {
+                "cmpeq"
+            } else {
+                "cmpgt"
+            };
+
+            let cmp = simple_intrinsic(intrinsic_name, vec_ty.scalar, vec_ty.scalar_bits, ty_bits);
+            // SSE4.2 only has signed GT for i64
+            let args = if method == "simd_lt" {
+                quote! { b.into(), a.into() }
+            } else {
+                quote! { a.into(), b.into() }
+            };
+
+            quote! {
+                #cmp(#args)
+            }
         } else {
             arch.expr(method, vec_ty, &args)
         }
     } else {
+        // Floating point comparison
         arch.expr(method, vec_ty, &args)
     };
 
@@ -596,6 +623,16 @@ pub(crate) fn handle_unzip(
         quote! { unsafe { #intrinsic::<#mask>(a.into(), b.into()).simd_into(self) } }
     } else {
         match vec_ty.scalar_bits {
+            64 => {
+                let op = if select_even { "lo" } else { "hi" };
+                let intrinsic = format_ident!("_mm_unpack{op}_epi64");
+
+                quote! {
+                    unsafe {
+                        #intrinsic(a.into(), b.into()).simd_into(self)
+                    }
+                }
+            }
             32 => {
                 let op = if select_even { "lo" } else { "hi" };
 

diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
@@ -64,6 +64,7 @@ fn mk_simd_impl(level: Level) -> TokenStream {
                 #[inline(always)]
                 fn #method_ident(#args) -> #ret_ty
             };
+
             let m = match sig {
                 OpSig::Splat => {
                     let expr = Wasm.expr(method, vec_ty, &[quote! { val }]);
@@ -118,6 +119,45 @@ fn mk_simd_impl(level: Level) -> TokenStream {
                 OpSig::Binary => {
                     let args = [quote! { a.into() }, quote! { b.into() }];
                     match method {
+                        "max" | "min" if vec_ty.scalar_bits == 64 && vec_ty.len == 2 => {
+                            let is_max = method == "max";
+
+                            let xor_for_unsigned = if vec_ty.scalar == ScalarType::Unsigned {
+                                quote! {
+                                    let sign_bit = i64x2_splat(0x8000_0000_0000_0000u64 as i64);
+                                    let a_signed = v128_xor(a.into(), sign_bit);
+                                    let b_signed = v128_xor(b.into(), sign_bit);
+                                }
+                            } else {
+                                quote! {
+                                    let a_signed = a.into();
+                                    let b_signed = b.into();
+                                }
+                            };
+
+                            let body = if is_max {
+                                quote! {
+                                    let mask = i64x2_gt(a_signed, b_signed);
+                                    let a_masked = v128_and(mask, a.into());
+                                    let b_masked = v128_andnot(mask, b.into());
+                                    v128_or(a_masked, b_masked)
+                                }
+                            } else {
+                                quote! {
+                                    let mask = i64x2_gt(a_signed, b_signed);
+                                    let a_masked = v128_andnot(mask, a.into());
+                                    let b_masked = v128_and(mask, b.into());
+                                    v128_or(a_masked, b_masked)
+                                }
+                            };
+
+                            quote! {
+                                #method_sig {
+                                    #xor_for_unsigned
+                                    #body.simd_into(self)
+                                }
+                            }
+                        }
                         "mul" if vec_ty.scalar_bits == 8 && vec_ty.len == 16 => {
                             let (extmul_low, extmul_high) = match vec_ty.scalar {
                                 ScalarType::Unsigned => (
@@ -183,9 +223,31 @@ fn mk_simd_impl(level: Level) -> TokenStream {
                 OpSig::Compare => {
                     let args = [quote! { a.into() }, quote! { b.into() }];
                     let expr = Wasm.expr(method, vec_ty, &args);
-                    quote! {
-                        #method_sig {
-                            #expr.simd_into(self)
+
+                    let missing_op = ["lt", "gt", "le", "ge"]
+                        .iter()
+                        .find(|&op| method.ends_with(op));
+
+                    if vec_ty.scalar_bits == 64
+                        && vec_ty.scalar == ScalarType::Unsigned
+                        && missing_op.is_some()
+                    {
+                        let op = missing_op.unwrap();
+                        let wasm_ident = format_ident!("i64x2_{}", op);
+                        quote! {
+                            #method_sig {
+                                let sign_bit = i64x2_splat(0x8000_0000_0000_0000u64 as i64);
+                                let a_signed = v128_xor(a.into(), sign_bit);
+                                let b_signed = v128_xor(b.into(), sign_bit);
+
+                                #wasm_ident(a_signed, b_signed).simd_into(self)
+                            }
+                        }
+                    } else {
+                        quote! {
+                            #method_sig {
+                                #expr.simd_into(self)
+                            }
                         }
                     }
                 }
@@ -386,6 +448,13 @@ fn mk_simd_impl(level: Level) -> TokenStream {
                             quote! { 2, 3, 6, 7 },
                             quote! { u32x4_shuffle },
                         ),
+                        64 => (
+                            quote! { 0, 2 },
+                            quote! { 1, 3 },
+                            quote! { 0, 1 },
+                            quote! { 2, 3 },
+                            quote! { u64x2_shuffle },
+                        ),
                         _ => panic!("unsupported scalar_bits"),
                     };
 
@@ -455,6 +524,7 @@ fn mk_simd_impl(level: Level) -> TokenStream {
                             quote! { 2, 6, 3, 7 },
                             quote! { u32x4_shuffle },
                         ),
+                        64 => (quote! { 0, 2 }, quote! { 1, 3 }, quote! { u64x2_shuffle }),
                         _ => panic!("unsupported scalar_bits"),
                     };
 
@@ -526,9 +596,12 @@ fn mk_simd_impl(level: Level) -> TokenStream {
             type i16s = i16x8<Self>;
             type u32s = u32x4<Self>;
             type i32s = i32x4<Self>;
+            type u64s = u64x2<Self>;
+            type i64s = i64x2<Self>;
             type mask8s = mask8x16<Self>;
             type mask16s = mask16x8<Self>;
             type mask32s = mask32x4<Self>;
+            type mask64s = mask64x2<Self>;
 
             #[inline(always)]
             fn level(self) -> Level {

diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs
@@ -116,6 +116,8 @@ pub const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Mask, 32, 4),
     VecType::new(ScalarType::Float, 64, 2),
     VecType::new(ScalarType::Mask, 64, 2),
+    VecType::new(ScalarType::Int, 64, 2),
+    VecType::new(ScalarType::Unsigned, 64, 2),
     // 256 bit types
     VecType::new(ScalarType::Float, 32, 8),
     VecType::new(ScalarType::Int, 8, 32),
@@ -129,6 +131,8 @@ pub const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Mask, 32, 8),
     VecType::new(ScalarType::Float, 64, 4),
     VecType::new(ScalarType::Mask, 64, 4),
+    VecType::new(ScalarType::Int, 64, 4),
+    VecType::new(ScalarType::Unsigned, 64, 4),
     // 512 bit types
     VecType::new(ScalarType::Float, 32, 16),
     VecType::new(ScalarType::Int, 8, 64),
@@ -142,6 +146,8 @@ pub const SIMD_TYPES: &[VecType] = &[
     VecType::new(ScalarType::Mask, 32, 16),
     VecType::new(ScalarType::Float, 64, 8),
     VecType::new(ScalarType::Mask, 64, 8),
+    VecType::new(ScalarType::Int, 64, 8),
+    VecType::new(ScalarType::Unsigned, 64, 8),
 ];
 
 pub fn type_imports() -> TokenStream {