From 0387e997ec188205dbc21028a52897e4d4dc6701 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 30 Jan 2026 20:22:18 +0100 Subject: [PATCH 1/4] test the `vld1*` functions --- crates/core_arch/src/aarch64/neon/mod.rs | 864 +++++++++++++++++++++++ 1 file changed, 864 insertions(+) diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index bac4574239..feaf94a7f9 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -993,6 +993,870 @@ mod tests { assert_eq!(vals[1], 1.); assert_eq!(vals[2], 2.); } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1_f16_x2() { + let vals: [f16; 8] = crate::array::from_fn(|i| i as f16); + let a: float16x4x2_t = transmute(vals); + let mut tmp = [0_f16; 8]; + vst1_f16_x2(tmp.as_mut_ptr().cast(), a); + let r: float16x4x2_t = vld1_f16_x2(tmp.as_ptr().cast()); + let out: [f16; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1_f16_x3() { + let vals: [f16; 12] = crate::array::from_fn(|i| i as f16); + let a: float16x4x3_t = transmute(vals); + let mut tmp = [0_f16; 12]; + vst1_f16_x3(tmp.as_mut_ptr().cast(), a); + let r: float16x4x3_t = vld1_f16_x3(tmp.as_ptr().cast()); + let out: [f16; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1_f16_x4() { + let vals: [f16; 16] = crate::array::from_fn(|i| i as f16); + let a: float16x4x4_t = transmute(vals); + let mut tmp = [0_f16; 16]; + vst1_f16_x4(tmp.as_mut_ptr().cast(), a); + let r: float16x4x4_t = vld1_f16_x4(tmp.as_ptr().cast()); + let out: [f16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1q_f16_x2() { + let vals: [f16; 16] = crate::array::from_fn(|i| i as f16); + let a: float16x8x2_t = transmute(vals); + let mut tmp = [0_f16; 16]; + vst1q_f16_x2(tmp.as_mut_ptr().cast(), a); + let r: float16x8x2_t = vld1q_f16_x2(tmp.as_ptr().cast()); + let out: [f16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1q_f16_x3() { + let vals: [f16; 24] = crate::array::from_fn(|i| i as f16); + let a: float16x8x3_t = transmute(vals); + let mut tmp = [0_f16; 24]; + vst1q_f16_x3(tmp.as_mut_ptr().cast(), a); + let r: float16x8x3_t = vld1q_f16_x3(tmp.as_ptr().cast()); + let out: [f16; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,fp16")] + #[cfg(not(target_arch = "arm64ec"))] + unsafe fn test_vld1q_f16_x4() { + let vals: [f16; 32] = crate::array::from_fn(|i| i as f16); + let a: float16x8x4_t = transmute(vals); + let mut tmp = [0_f16; 32]; + vst1q_f16_x4(tmp.as_mut_ptr().cast(), a); + let r: float16x8x4_t = vld1q_f16_x4(tmp.as_ptr().cast()); + let out: [f16; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_f32_x2() { + let vals: [f32; 4] = crate::array::from_fn(|i| i as f32); + let a: float32x2x2_t = transmute(vals); + let mut tmp = [0_f32; 4]; + vst1_f32_x2(tmp.as_mut_ptr().cast(), a); + let r: float32x2x2_t = vld1_f32_x2(tmp.as_ptr().cast()); + let out: [f32; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_f32_x3() { + let vals: [f32; 6] = crate::array::from_fn(|i| 
i as f32); + let a: float32x2x3_t = transmute(vals); + let mut tmp = [0_f32; 6]; + vst1_f32_x3(tmp.as_mut_ptr().cast(), a); + let r: float32x2x3_t = vld1_f32_x3(tmp.as_ptr().cast()); + let out: [f32; 6] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_f32_x4() { + let vals: [f32; 8] = crate::array::from_fn(|i| i as f32); + let a: float32x2x4_t = transmute(vals); + let mut tmp = [0_f32; 8]; + vst1_f32_x4(tmp.as_mut_ptr().cast(), a); + let r: float32x2x4_t = vld1_f32_x4(tmp.as_ptr().cast()); + let out: [f32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_f32_x2() { + let vals: [f32; 8] = crate::array::from_fn(|i| i as f32); + let a: float32x4x2_t = transmute(vals); + let mut tmp = [0_f32; 8]; + vst1q_f32_x2(tmp.as_mut_ptr().cast(), a); + let r: float32x4x2_t = vld1q_f32_x2(tmp.as_ptr().cast()); + let out: [f32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_f32_x3() { + let vals: [f32; 12] = crate::array::from_fn(|i| i as f32); + let a: float32x4x3_t = transmute(vals); + let mut tmp = [0_f32; 12]; + vst1q_f32_x3(tmp.as_mut_ptr().cast(), a); + let r: float32x4x3_t = vld1q_f32_x3(tmp.as_ptr().cast()); + let out: [f32; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_f32_x4() { + let vals: [f32; 16] = crate::array::from_fn(|i| i as f32); + let a: float32x4x4_t = transmute(vals); + let mut tmp = [0_f32; 16]; + vst1q_f32_x4(tmp.as_mut_ptr().cast(), a); + let r: float32x4x4_t = vld1q_f32_x4(tmp.as_ptr().cast()); + let out: [f32; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_p64_x2() { + let vals: [p64; 2] = crate::array::from_fn(|i| i as p64); + let a: poly64x1x2_t = transmute(vals); + let mut tmp = [0 as p64; 2]; + vst1_p64_x2(tmp.as_mut_ptr().cast(), a); + let r: poly64x1x2_t = vld1_p64_x2(tmp.as_ptr().cast()); + let out: [p64; 2] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_p64_x3() { + let vals: [p64; 3] = crate::array::from_fn(|i| i as p64); + let a: poly64x1x3_t = transmute(vals); + let mut tmp = [0 as p64; 3]; + vst1_p64_x3(tmp.as_mut_ptr().cast(), a); + let r: poly64x1x3_t = vld1_p64_x3(tmp.as_ptr().cast()); + let out: [p64; 3] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1_p64_x4() { + let vals: [p64; 4] = crate::array::from_fn(|i| i as p64); + let a: poly64x1x4_t = transmute(vals); + let mut tmp = [0 as p64; 4]; + vst1_p64_x4(tmp.as_mut_ptr().cast(), a); + let r: poly64x1x4_t = vld1_p64_x4(tmp.as_ptr().cast()); + let out: [p64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_p64_x2() { + let vals: [p64; 4] = crate::array::from_fn(|i| i as p64); + let a: poly64x2x2_t = transmute(vals); + let mut tmp = [0 as p64; 4]; + vst1q_p64_x2(tmp.as_mut_ptr().cast(), a); + let r: poly64x2x2_t = vld1q_p64_x2(tmp.as_ptr().cast()); + let out: [p64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_p64_x3() { + let vals: [p64; 6] = crate::array::from_fn(|i| i as p64); + let a: poly64x2x3_t = transmute(vals); + let mut tmp = [0 as p64; 6]; + vst1q_p64_x3(tmp.as_mut_ptr().cast(), a); + let r: poly64x2x3_t = vld1q_p64_x3(tmp.as_ptr().cast()); + let out: [p64; 6] = transmute(r); + 
assert_eq!(out, vals); + } + + #[simd_test(enable = "neon,aes")] + unsafe fn test_vld1q_p64_x4() { + let vals: [p64; 8] = crate::array::from_fn(|i| i as p64); + let a: poly64x2x4_t = transmute(vals); + let mut tmp = [0 as p64; 8]; + vst1q_p64_x4(tmp.as_mut_ptr().cast(), a); + let r: poly64x2x4_t = vld1q_p64_x4(tmp.as_ptr().cast()); + let out: [p64; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s8_x2() { + let vals: [i8; 16] = crate::array::from_fn(|i| i as i8); + let a: int8x8x2_t = transmute(vals); + let mut tmp = [0_i8; 16]; + vst1_s8_x2(tmp.as_mut_ptr().cast(), a); + let r: int8x8x2_t = vld1_s8_x2(tmp.as_ptr().cast()); + let out: [i8; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s8_x3() { + let vals: [i8; 24] = crate::array::from_fn(|i| i as i8); + let a: int8x8x3_t = transmute(vals); + let mut tmp = [0_i8; 24]; + vst1_s8_x3(tmp.as_mut_ptr().cast(), a); + let r: int8x8x3_t = vld1_s8_x3(tmp.as_ptr().cast()); + let out: [i8; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s8_x4() { + let vals: [i8; 32] = crate::array::from_fn(|i| i as i8); + let a: int8x8x4_t = transmute(vals); + let mut tmp = [0_i8; 32]; + vst1_s8_x4(tmp.as_mut_ptr().cast(), a); + let r: int8x8x4_t = vld1_s8_x4(tmp.as_ptr().cast()); + let out: [i8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s8_x2() { + let vals: [i8; 32] = crate::array::from_fn(|i| i as i8); + let a: int8x16x2_t = transmute(vals); + let mut tmp = [0_i8; 32]; + vst1q_s8_x2(tmp.as_mut_ptr().cast(), a); + let r: int8x16x2_t = vld1q_s8_x2(tmp.as_ptr().cast()); + let out: [i8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s8_x3() { + let vals: [i8; 48] = crate::array::from_fn(|i| i as i8); + let a: int8x16x3_t = transmute(vals); + let mut tmp = [0_i8; 48]; + vst1q_s8_x3(tmp.as_mut_ptr().cast(), a); + let r: int8x16x3_t = vld1q_s8_x3(tmp.as_ptr().cast()); + let out: [i8; 48] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s8_x4() { + let vals: [i8; 64] = crate::array::from_fn(|i| i as i8); + let a: int8x16x4_t = transmute(vals); + let mut tmp = [0_i8; 64]; + vst1q_s8_x4(tmp.as_mut_ptr().cast(), a); + let r: int8x16x4_t = vld1q_s8_x4(tmp.as_ptr().cast()); + let out: [i8; 64] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s16_x2() { + let vals: [i16; 8] = crate::array::from_fn(|i| i as i16); + let a: int16x4x2_t = transmute(vals); + let mut tmp = [0_i16; 8]; + vst1_s16_x2(tmp.as_mut_ptr().cast(), a); + let r: int16x4x2_t = vld1_s16_x2(tmp.as_ptr().cast()); + let out: [i16; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s16_x3() { + let vals: [i16; 12] = crate::array::from_fn(|i| i as i16); + let a: int16x4x3_t = transmute(vals); + let mut tmp = [0_i16; 12]; + vst1_s16_x3(tmp.as_mut_ptr().cast(), a); + let r: int16x4x3_t = vld1_s16_x3(tmp.as_ptr().cast()); + let out: [i16; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s16_x4() { + let vals: [i16; 16] = crate::array::from_fn(|i| i as i16); + let a: int16x4x4_t = transmute(vals); + let mut tmp = [0_i16; 16]; + vst1_s16_x4(tmp.as_mut_ptr().cast(), a); + let r: int16x4x4_t = 
vld1_s16_x4(tmp.as_ptr().cast()); + let out: [i16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s16_x2() { + let vals: [i16; 16] = crate::array::from_fn(|i| i as i16); + let a: int16x8x2_t = transmute(vals); + let mut tmp = [0_i16; 16]; + vst1q_s16_x2(tmp.as_mut_ptr().cast(), a); + let r: int16x8x2_t = vld1q_s16_x2(tmp.as_ptr().cast()); + let out: [i16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s16_x3() { + let vals: [i16; 24] = crate::array::from_fn(|i| i as i16); + let a: int16x8x3_t = transmute(vals); + let mut tmp = [0_i16; 24]; + vst1q_s16_x3(tmp.as_mut_ptr().cast(), a); + let r: int16x8x3_t = vld1q_s16_x3(tmp.as_ptr().cast()); + let out: [i16; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s16_x4() { + let vals: [i16; 32] = crate::array::from_fn(|i| i as i16); + let a: int16x8x4_t = transmute(vals); + let mut tmp = [0_i16; 32]; + vst1q_s16_x4(tmp.as_mut_ptr().cast(), a); + let r: int16x8x4_t = vld1q_s16_x4(tmp.as_ptr().cast()); + let out: [i16; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s32_x2() { + let vals: [i32; 4] = crate::array::from_fn(|i| i as i32); + let a: int32x2x2_t = transmute(vals); + let mut tmp = [0_i32; 4]; + vst1_s32_x2(tmp.as_mut_ptr().cast(), a); + let r: int32x2x2_t = vld1_s32_x2(tmp.as_ptr().cast()); + let out: [i32; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s32_x3() { + let vals: [i32; 6] = crate::array::from_fn(|i| i as i32); + let a: int32x2x3_t = transmute(vals); + let mut tmp = [0_i32; 6]; + vst1_s32_x3(tmp.as_mut_ptr().cast(), a); + let r: int32x2x3_t = vld1_s32_x3(tmp.as_ptr().cast()); + let out: [i32; 6] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s32_x4() { + let vals: [i32; 8] = crate::array::from_fn(|i| i as i32); + let a: int32x2x4_t = transmute(vals); + let mut tmp = [0_i32; 8]; + vst1_s32_x4(tmp.as_mut_ptr().cast(), a); + let r: int32x2x4_t = vld1_s32_x4(tmp.as_ptr().cast()); + let out: [i32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s32_x2() { + let vals: [i32; 8] = crate::array::from_fn(|i| i as i32); + let a: int32x4x2_t = transmute(vals); + let mut tmp = [0_i32; 8]; + vst1q_s32_x2(tmp.as_mut_ptr().cast(), a); + let r: int32x4x2_t = vld1q_s32_x2(tmp.as_ptr().cast()); + let out: [i32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s32_x3() { + let vals: [i32; 12] = crate::array::from_fn(|i| i as i32); + let a: int32x4x3_t = transmute(vals); + let mut tmp = [0_i32; 12]; + vst1q_s32_x3(tmp.as_mut_ptr().cast(), a); + let r: int32x4x3_t = vld1q_s32_x3(tmp.as_ptr().cast()); + let out: [i32; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s32_x4() { + let vals: [i32; 16] = crate::array::from_fn(|i| i as i32); + let a: int32x4x4_t = transmute(vals); + let mut tmp = [0_i32; 16]; + vst1q_s32_x4(tmp.as_mut_ptr().cast(), a); + let r: int32x4x4_t = vld1q_s32_x4(tmp.as_ptr().cast()); + let out: [i32; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s64_x2() { + let vals: [i64; 2] = crate::array::from_fn(|i| i as i64); + let a: int64x1x2_t = transmute(vals); + let mut tmp 
= [0_i64; 2]; + vst1_s64_x2(tmp.as_mut_ptr().cast(), a); + let r: int64x1x2_t = vld1_s64_x2(tmp.as_ptr().cast()); + let out: [i64; 2] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s64_x3() { + let vals: [i64; 3] = crate::array::from_fn(|i| i as i64); + let a: int64x1x3_t = transmute(vals); + let mut tmp = [0_i64; 3]; + vst1_s64_x3(tmp.as_mut_ptr().cast(), a); + let r: int64x1x3_t = vld1_s64_x3(tmp.as_ptr().cast()); + let out: [i64; 3] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_s64_x4() { + let vals: [i64; 4] = crate::array::from_fn(|i| i as i64); + let a: int64x1x4_t = transmute(vals); + let mut tmp = [0_i64; 4]; + vst1_s64_x4(tmp.as_mut_ptr().cast(), a); + let r: int64x1x4_t = vld1_s64_x4(tmp.as_ptr().cast()); + let out: [i64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s64_x2() { + let vals: [i64; 4] = crate::array::from_fn(|i| i as i64); + let a: int64x2x2_t = transmute(vals); + let mut tmp = [0_i64; 4]; + vst1q_s64_x2(tmp.as_mut_ptr().cast(), a); + let r: int64x2x2_t = vld1q_s64_x2(tmp.as_ptr().cast()); + let out: [i64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s64_x3() { + let vals: [i64; 6] = crate::array::from_fn(|i| i as i64); + let a: int64x2x3_t = transmute(vals); + let mut tmp = [0_i64; 6]; + vst1q_s64_x3(tmp.as_mut_ptr().cast(), a); + let r: int64x2x3_t = vld1q_s64_x3(tmp.as_ptr().cast()); + let out: [i64; 6] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_s64_x4() { + let vals: [i64; 8] = crate::array::from_fn(|i| i as i64); + let a: int64x2x4_t = transmute(vals); + let mut tmp = [0_i64; 8]; + vst1q_s64_x4(tmp.as_mut_ptr().cast(), a); + let r: int64x2x4_t = vld1q_s64_x4(tmp.as_ptr().cast()); + let out: [i64; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u8_x2() { + let vals: [u8; 16] = crate::array::from_fn(|i| i as u8); + let a: uint8x8x2_t = transmute(vals); + let mut tmp = [0_u8; 16]; + vst1_u8_x2(tmp.as_mut_ptr().cast(), a); + let r: uint8x8x2_t = vld1_u8_x2(tmp.as_ptr().cast()); + let out: [u8; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u8_x3() { + let vals: [u8; 24] = crate::array::from_fn(|i| i as u8); + let a: uint8x8x3_t = transmute(vals); + let mut tmp = [0_u8; 24]; + vst1_u8_x3(tmp.as_mut_ptr().cast(), a); + let r: uint8x8x3_t = vld1_u8_x3(tmp.as_ptr().cast()); + let out: [u8; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u8_x4() { + let vals: [u8; 32] = crate::array::from_fn(|i| i as u8); + let a: uint8x8x4_t = transmute(vals); + let mut tmp = [0_u8; 32]; + vst1_u8_x4(tmp.as_mut_ptr().cast(), a); + let r: uint8x8x4_t = vld1_u8_x4(tmp.as_ptr().cast()); + let out: [u8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u8_x2() { + let vals: [u8; 32] = crate::array::from_fn(|i| i as u8); + let a: uint8x16x2_t = transmute(vals); + let mut tmp = [0_u8; 32]; + vst1q_u8_x2(tmp.as_mut_ptr().cast(), a); + let r: uint8x16x2_t = vld1q_u8_x2(tmp.as_ptr().cast()); + let out: [u8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u8_x3() { + let vals: [u8; 48] = crate::array::from_fn(|i| i as u8); + let a: 
uint8x16x3_t = transmute(vals); + let mut tmp = [0_u8; 48]; + vst1q_u8_x3(tmp.as_mut_ptr().cast(), a); + let r: uint8x16x3_t = vld1q_u8_x3(tmp.as_ptr().cast()); + let out: [u8; 48] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u8_x4() { + let vals: [u8; 64] = crate::array::from_fn(|i| i as u8); + let a: uint8x16x4_t = transmute(vals); + let mut tmp = [0_u8; 64]; + vst1q_u8_x4(tmp.as_mut_ptr().cast(), a); + let r: uint8x16x4_t = vld1q_u8_x4(tmp.as_ptr().cast()); + let out: [u8; 64] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u16_x2() { + let vals: [u16; 8] = crate::array::from_fn(|i| i as u16); + let a: uint16x4x2_t = transmute(vals); + let mut tmp = [0_u16; 8]; + vst1_u16_x2(tmp.as_mut_ptr().cast(), a); + let r: uint16x4x2_t = vld1_u16_x2(tmp.as_ptr().cast()); + let out: [u16; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u16_x3() { + let vals: [u16; 12] = crate::array::from_fn(|i| i as u16); + let a: uint16x4x3_t = transmute(vals); + let mut tmp = [0_u16; 12]; + vst1_u16_x3(tmp.as_mut_ptr().cast(), a); + let r: uint16x4x3_t = vld1_u16_x3(tmp.as_ptr().cast()); + let out: [u16; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u16_x4() { + let vals: [u16; 16] = crate::array::from_fn(|i| i as u16); + let a: uint16x4x4_t = transmute(vals); + let mut tmp = [0_u16; 16]; + vst1_u16_x4(tmp.as_mut_ptr().cast(), a); + let r: uint16x4x4_t = vld1_u16_x4(tmp.as_ptr().cast()); + let out: [u16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u16_x2() { + let vals: [u16; 16] = crate::array::from_fn(|i| i as u16); + let a: uint16x8x2_t = transmute(vals); + let mut tmp = [0_u16; 16]; + vst1q_u16_x2(tmp.as_mut_ptr().cast(), a); + let r: uint16x8x2_t = vld1q_u16_x2(tmp.as_ptr().cast()); + let out: [u16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u16_x3() { + let vals: [u16; 24] = crate::array::from_fn(|i| i as u16); + let a: uint16x8x3_t = transmute(vals); + let mut tmp = [0_u16; 24]; + vst1q_u16_x3(tmp.as_mut_ptr().cast(), a); + let r: uint16x8x3_t = vld1q_u16_x3(tmp.as_ptr().cast()); + let out: [u16; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u16_x4() { + let vals: [u16; 32] = crate::array::from_fn(|i| i as u16); + let a: uint16x8x4_t = transmute(vals); + let mut tmp = [0_u16; 32]; + vst1q_u16_x4(tmp.as_mut_ptr().cast(), a); + let r: uint16x8x4_t = vld1q_u16_x4(tmp.as_ptr().cast()); + let out: [u16; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u32_x2() { + let vals: [u32; 4] = crate::array::from_fn(|i| i as u32); + let a: uint32x2x2_t = transmute(vals); + let mut tmp = [0_u32; 4]; + vst1_u32_x2(tmp.as_mut_ptr().cast(), a); + let r: uint32x2x2_t = vld1_u32_x2(tmp.as_ptr().cast()); + let out: [u32; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u32_x3() { + let vals: [u32; 6] = crate::array::from_fn(|i| i as u32); + let a: uint32x2x3_t = transmute(vals); + let mut tmp = [0_u32; 6]; + vst1_u32_x3(tmp.as_mut_ptr().cast(), a); + let r: uint32x2x3_t = vld1_u32_x3(tmp.as_ptr().cast()); + let out: [u32; 6] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vld1_u32_x4() { + let vals: [u32; 8] = crate::array::from_fn(|i| i as u32); + let a: uint32x2x4_t = transmute(vals); + let mut tmp = [0_u32; 8]; + vst1_u32_x4(tmp.as_mut_ptr().cast(), a); + let r: uint32x2x4_t = vld1_u32_x4(tmp.as_ptr().cast()); + let out: [u32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u32_x2() { + let vals: [u32; 8] = crate::array::from_fn(|i| i as u32); + let a: uint32x4x2_t = transmute(vals); + let mut tmp = [0_u32; 8]; + vst1q_u32_x2(tmp.as_mut_ptr().cast(), a); + let r: uint32x4x2_t = vld1q_u32_x2(tmp.as_ptr().cast()); + let out: [u32; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u32_x3() { + let vals: [u32; 12] = crate::array::from_fn(|i| i as u32); + let a: uint32x4x3_t = transmute(vals); + let mut tmp = [0_u32; 12]; + vst1q_u32_x3(tmp.as_mut_ptr().cast(), a); + let r: uint32x4x3_t = vld1q_u32_x3(tmp.as_ptr().cast()); + let out: [u32; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u32_x4() { + let vals: [u32; 16] = crate::array::from_fn(|i| i as u32); + let a: uint32x4x4_t = transmute(vals); + let mut tmp = [0_u32; 16]; + vst1q_u32_x4(tmp.as_mut_ptr().cast(), a); + let r: uint32x4x4_t = vld1q_u32_x4(tmp.as_ptr().cast()); + let out: [u32; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u64_x2() { + let vals: [u64; 2] = crate::array::from_fn(|i| i as u64); + let a: uint64x1x2_t = transmute(vals); + let mut tmp = [0_u64; 2]; + vst1_u64_x2(tmp.as_mut_ptr().cast(), a); + let r: uint64x1x2_t = vld1_u64_x2(tmp.as_ptr().cast()); + let out: [u64; 2] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u64_x3() { + let vals: [u64; 3] = crate::array::from_fn(|i| i as u64); + let a: uint64x1x3_t = transmute(vals); + let mut tmp = [0_u64; 3]; + vst1_u64_x3(tmp.as_mut_ptr().cast(), a); + let r: uint64x1x3_t = vld1_u64_x3(tmp.as_ptr().cast()); + let out: [u64; 3] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_u64_x4() { + let vals: [u64; 4] = crate::array::from_fn(|i| i as u64); + let a: uint64x1x4_t = transmute(vals); + let mut tmp = [0_u64; 4]; + vst1_u64_x4(tmp.as_mut_ptr().cast(), a); + let r: uint64x1x4_t = vld1_u64_x4(tmp.as_ptr().cast()); + let out: [u64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u64_x2() { + let vals: [u64; 4] = crate::array::from_fn(|i| i as u64); + let a: uint64x2x2_t = transmute(vals); + let mut tmp = [0_u64; 4]; + vst1q_u64_x2(tmp.as_mut_ptr().cast(), a); + let r: uint64x2x2_t = vld1q_u64_x2(tmp.as_ptr().cast()); + let out: [u64; 4] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u64_x3() { + let vals: [u64; 6] = crate::array::from_fn(|i| i as u64); + let a: uint64x2x3_t = transmute(vals); + let mut tmp = [0_u64; 6]; + vst1q_u64_x3(tmp.as_mut_ptr().cast(), a); + let r: uint64x2x3_t = vld1q_u64_x3(tmp.as_ptr().cast()); + let out: [u64; 6] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_u64_x4() { + let vals: [u64; 8] = crate::array::from_fn(|i| i as u64); + let a: uint64x2x4_t = transmute(vals); + let mut tmp = [0_u64; 8]; + vst1q_u64_x4(tmp.as_mut_ptr().cast(), a); + let r: uint64x2x4_t = vld1q_u64_x4(tmp.as_ptr().cast()); + let out: [u64; 8] = 
transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p8_x2() { + let vals: [p8; 16] = crate::array::from_fn(|i| i as p8); + let a: poly8x8x2_t = transmute(vals); + let mut tmp = [0 as p8; 16]; + vst1_p8_x2(tmp.as_mut_ptr().cast(), a); + let r: poly8x8x2_t = vld1_p8_x2(tmp.as_ptr().cast()); + let out: [p8; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p8_x3() { + let vals: [p8; 24] = crate::array::from_fn(|i| i as p8); + let a: poly8x8x3_t = transmute(vals); + let mut tmp = [0 as p8; 24]; + vst1_p8_x3(tmp.as_mut_ptr().cast(), a); + let r: poly8x8x3_t = vld1_p8_x3(tmp.as_ptr().cast()); + let out: [p8; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p8_x4() { + let vals: [p8; 32] = crate::array::from_fn(|i| i as p8); + let a: poly8x8x4_t = transmute(vals); + let mut tmp = [0 as p8; 32]; + vst1_p8_x4(tmp.as_mut_ptr().cast(), a); + let r: poly8x8x4_t = vld1_p8_x4(tmp.as_ptr().cast()); + let out: [p8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p8_x2() { + let vals: [p8; 32] = crate::array::from_fn(|i| i as p8); + let a: poly8x16x2_t = transmute(vals); + let mut tmp = [0 as p8; 32]; + vst1q_p8_x2(tmp.as_mut_ptr().cast(), a); + let r: poly8x16x2_t = vld1q_p8_x2(tmp.as_ptr().cast()); + let out: [p8; 32] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p8_x3() { + let vals: [p8; 48] = crate::array::from_fn(|i| i as p8); + let a: poly8x16x3_t = transmute(vals); + let mut tmp = [0 as p8; 48]; + vst1q_p8_x3(tmp.as_mut_ptr().cast(), a); + let r: poly8x16x3_t = vld1q_p8_x3(tmp.as_ptr().cast()); + let out: [p8; 48] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p8_x4() { + let vals: [p8; 64] = crate::array::from_fn(|i| i as p8); + let a: poly8x16x4_t = transmute(vals); + let mut tmp = [0 as p8; 64]; + vst1q_p8_x4(tmp.as_mut_ptr().cast(), a); + let r: poly8x16x4_t = vld1q_p8_x4(tmp.as_ptr().cast()); + let out: [p8; 64] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p16_x2() { + let vals: [p16; 8] = crate::array::from_fn(|i| i as p16); + let a: poly16x4x2_t = transmute(vals); + let mut tmp = [0 as p16; 8]; + vst1_p16_x2(tmp.as_mut_ptr().cast(), a); + let r: poly16x4x2_t = vld1_p16_x2(tmp.as_ptr().cast()); + let out: [p16; 8] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p16_x3() { + let vals: [p16; 12] = crate::array::from_fn(|i| i as p16); + let a: poly16x4x3_t = transmute(vals); + let mut tmp = [0 as p16; 12]; + vst1_p16_x3(tmp.as_mut_ptr().cast(), a); + let r: poly16x4x3_t = vld1_p16_x3(tmp.as_ptr().cast()); + let out: [p16; 12] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1_p16_x4() { + let vals: [p16; 16] = crate::array::from_fn(|i| i as p16); + let a: poly16x4x4_t = transmute(vals); + let mut tmp = [0 as p16; 16]; + vst1_p16_x4(tmp.as_mut_ptr().cast(), a); + let r: poly16x4x4_t = vld1_p16_x4(tmp.as_ptr().cast()); + let out: [p16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p16_x2() { + let vals: [p16; 16] = crate::array::from_fn(|i| i as p16); + let a: poly16x8x2_t = transmute(vals); + let mut tmp = [0 as p16; 16]; + vst1q_p16_x2(tmp.as_mut_ptr().cast(), 
a); + let r: poly16x8x2_t = vld1q_p16_x2(tmp.as_ptr().cast()); + let out: [p16; 16] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p16_x3() { + let vals: [p16; 24] = crate::array::from_fn(|i| i as p16); + let a: poly16x8x3_t = transmute(vals); + let mut tmp = [0 as p16; 24]; + vst1q_p16_x3(tmp.as_mut_ptr().cast(), a); + let r: poly16x8x3_t = vld1q_p16_x3(tmp.as_ptr().cast()); + let out: [p16; 24] = transmute(r); + assert_eq!(out, vals); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld1q_p16_x4() { + let vals: [p16; 32] = crate::array::from_fn(|i| i as p16); + let a: poly16x8x4_t = transmute(vals); + let mut tmp = [0 as p16; 32]; + vst1q_p16_x4(tmp.as_mut_ptr().cast(), a); + let r: poly16x8x4_t = vld1q_p16_x4(tmp.as_ptr().cast()); + let out: [p16; 32] = transmute(r); + assert_eq!(out, vals); + } } #[cfg(test)] From e3feb19a9edf7530e550d064f5485c99d5abeca9 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Fri, 30 Jan 2026 21:35:56 +0100 Subject: [PATCH 2/4] maybe fix aarch64be unsigned vector tuple loads --- .../src/arm_shared/neon/generated.rs | 1352 ++--------------- .../spec/neon/arm_shared.spec.yml | 2 + 2 files changed, 102 insertions(+), 1252 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 3b67208182..2f52e3b52b 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -17067,7 +17067,6 @@ pub unsafe fn vld1_p64_x4(a: *const p64) -> poly64x1x4_t { #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] @@ -17087,38 +17086,10 @@ pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t { transmute(vld1q_s64_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_p64_x2(a: *const p64) -> poly64x2x2_t { - let mut ret_val: poly64x2x2_t = transmute(vld1q_s64_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] #[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(nop))] @@ -17138,39 +17109,10 @@ pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { transmute(vld1q_s64_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { - let mut ret_val: poly64x2x3_t = transmute(vld1q_s64_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] @@ -17189,35 +17131,6 @@ pub unsafe fn vld1q_p64_x3(a: *const p64) -> poly64x2x3_t { pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t { transmute(vld1q_s64_x4(transmute(a))) } -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p64_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon,aes")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_p64_x4(a: *const p64) -> poly64x2x4_t { - let mut ret_val: poly64x2x4_t = transmute(vld1q_s64_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; - ret_val -} #[doc = "Load multiple single-element structures to one, two, three, or four registers."] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_s8)"] #[doc = "## Safety"] @@ -18071,7 +17984,6 @@ pub unsafe fn vld1q_s64_x4(a: *const i64) -> int64x2x4_t { #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] 
#[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18091,11 +18003,10 @@ pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { transmute(vld1_s8_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18111,18 +18022,14 @@ pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { - let mut ret_val: uint8x8x2_t = transmute(vld1_s8_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { + transmute(vld1_s8_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18138,15 +18045,14 @@ pub unsafe fn vld1_u8_x2(a: *const u8) -> uint8x8x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { - transmute(vld1_s8_x3(transmute(a))) +pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { + transmute(vld1_s8_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18162,19 +18068,14 @@ pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { - let mut ret_val: uint8x8x3_t = transmute(vld1_s8_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { 
simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { + transmute(vld1q_s8_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18190,15 +18091,14 @@ pub unsafe fn vld1_u8_x3(a: *const u8) -> uint8x8x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { - transmute(vld1_s8_x4(transmute(a))) +pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { + transmute(vld1q_s8_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18214,20 +18114,14 @@ pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { - let mut ret_val: uint8x8x4_t = transmute(vld1_s8_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { + transmute(vld1q_s8_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18243,15 +18137,14 @@ pub unsafe fn vld1_u8_x4(a: *const u8) -> uint8x8x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { - transmute(vld1q_s8_x2(transmute(a))) +pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { + transmute(vld1_s16_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18267,30 +18160,14 @@ pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { - let mut ret_val: uint8x16x2_t = transmute(vld1q_s8_x2(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t { + transmute(vld1_s16_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18306,15 +18183,14 @@ pub unsafe fn vld1q_u8_x2(a: *const u8) -> uint8x16x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { - transmute(vld1q_s8_x3(transmute(a))) +pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t { + transmute(vld1_s16_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18330,37 +18206,14 @@ pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { - let mut ret_val: uint8x16x3_t = transmute(vld1q_s8_x3(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.2 = unsafe { - simd_shuffle!( - ret_val.2, - ret_val.2, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t { + transmute(vld1q_s16_x2(transmute(a))) } #[doc = "Load 
multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18376,15 +18229,14 @@ pub unsafe fn vld1q_u8_x3(a: *const u8) -> uint8x16x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { - transmute(vld1q_s8_x4(transmute(a))) +pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t { + transmute(vld1q_s16_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18400,44 +18252,14 @@ pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { - let mut ret_val: uint8x16x4_t = transmute(vld1q_s8_x4(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.2 = unsafe { - simd_shuffle!( - ret_val.2, - ret_val.2, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.3 = unsafe { - simd_shuffle!( - ret_val.3, - ret_val.3, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t { + transmute(vld1q_s16_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18453,15 +18275,14 @@ pub unsafe fn vld1q_u8_x4(a: *const u8) -> uint8x16x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { - transmute(vld1_s16_x2(transmute(a))) +pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t { + transmute(vld1_s32_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc 
= "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18477,18 +18298,14 @@ pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { - let mut ret_val: uint16x4x2_t = transmute(vld1_s16_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t { + transmute(vld1_s32_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -18504,840 +18321,14 @@ pub unsafe fn vld1_u16_x2(a: *const u16) -> uint16x4x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t { - transmute(vld1_s16_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u16_x3(a: *const u16) -> uint16x4x3_t { - let mut ret_val: uint16x4x3_t = transmute(vld1_s16_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t { - transmute(vld1_s16_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u16_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u16_x4(a: *const u16) -> uint16x4x4_t { - let mut ret_val: uint16x4x4_t = transmute(vld1_s16_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t { - transmute(vld1q_s16_x2(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x2(a: *const u16) -> uint16x8x2_t { - let mut ret_val: uint16x8x2_t = transmute(vld1q_s16_x2(transmute(a))); - ret_val.0 = 
unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t { - transmute(vld1q_s16_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x3(a: *const u16) -> uint16x8x3_t { - let mut ret_val: uint16x8x3_t = transmute(vld1q_s16_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t { - transmute(vld1q_s16_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u16_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] 
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u16_x4(a: *const u16) -> uint16x8x4_t { - let mut ret_val: uint16x8x4_t = transmute(vld1q_s16_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t { - transmute(vld1_s32_x2(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x2(a: *const u32) -> uint32x2x2_t { - let mut ret_val: uint32x2x2_t = transmute(vld1_s32_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = 
"arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t { - transmute(vld1_s32_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x3(a: *const u32) -> uint32x2x3_t { - let mut ret_val: uint32x2x3_t = transmute(vld1_s32_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t { - transmute(vld1_s32_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u32_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t { - let mut ret_val: uint32x2x4_t = transmute(vld1_s32_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; - ret_val -} 
-#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t { - transmute(vld1q_s32_x2(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t { - let mut ret_val: uint32x4x2_t = transmute(vld1q_s32_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t { - transmute(vld1q_s32_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = 
"neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t { - let mut ret_val: uint32x4x3_t = transmute(vld1q_s32_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t { - transmute(vld1q_s32_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t { - let mut ret_val: uint32x4x4_t = transmute(vld1q_s32_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u64_x2(a: *const u64) -> uint64x1x2_t { - transmute(vld1_s64_x2(transmute(a))) -} -#[doc = 
"Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u64_x3(a: *const u64) -> uint64x1x3_t { - transmute(vld1_s64_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_u64_x4(a: *const u64) -> uint64x1x4_t { - transmute(vld1_s64_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t { - transmute(vld1q_s64_x2(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t { - let mut ret_val: uint64x2x2_t = 
transmute(vld1q_s64_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t { - transmute(vld1q_s64_x3(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x3)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t { - let mut ret_val: uint64x2x3_t = transmute(vld1q_s64_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t { - transmute(vld1q_s64_x4(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t { - let mut ret_val: uint64x2x4_t = transmute(vld1q_s64_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [1, 0]) }; - ret_val -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "little")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { - transmute(vld1_s8_x2(transmute(a))) -} -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x2)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { - let mut ret_val: poly8x8x2_t = transmute(vld1_s8_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_u32_x4(a: *const u32) -> uint32x2x4_t { + transmute(vld1_s32_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19353,15 
+18344,14 @@ pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { - transmute(vld1_s8_x3(transmute(a))) +pub unsafe fn vld1q_u32_x2(a: *const u32) -> uint32x4x2_t { + transmute(vld1q_s32_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19377,19 +18367,14 @@ pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { - let mut ret_val: poly8x8x3_t = transmute(vld1_s8_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1q_u32_x3(a: *const u32) -> uint32x4x3_t { + transmute(vld1q_s32_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u32_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19405,15 +18390,14 @@ pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { - transmute(vld1_s8_x4(transmute(a))) +pub unsafe fn vld1q_u32_x4(a: *const u32) -> uint32x4x4_t { + transmute(vld1q_s32_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19429,20 +18413,14 @@ pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { - let mut ret_val: poly8x8x4_t = transmute(vld1_s8_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 
1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_u64_x2(a: *const u64) -> uint64x1x2_t { + transmute(vld1_s64_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19458,15 +18436,14 @@ pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { - transmute(vld1q_s8_x2(transmute(a))) +pub unsafe fn vld1_u64_x3(a: *const u64) -> uint64x1x3_t { + transmute(vld1_s64_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_u64_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19482,30 +18459,14 @@ pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { - let mut ret_val: poly8x16x2_t = transmute(vld1q_s8_x2(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1_u64_x4(a: *const u64) -> uint64x1x4_t { + transmute(vld1_s64_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19521,15 +18482,14 @@ pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { - transmute(vld1q_s8_x3(transmute(a))) +pub unsafe fn vld1q_u64_x2(a: *const u64) -> uint64x2x2_t { + 
transmute(vld1q_s64_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19545,37 +18505,14 @@ pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { - let mut ret_val: poly8x16x3_t = transmute(vld1q_s8_x3(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.2 = unsafe { - simd_shuffle!( - ret_val.2, - ret_val.2, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1q_u64_x3(a: *const u64) -> uint64x2x3_t { + transmute(vld1q_s64_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_u64_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19591,15 +18528,14 @@ pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { - transmute(vld1q_s8_x4(transmute(a))) +pub unsafe fn vld1q_u64_x4(a: *const u64) -> uint64x2x4_t { + transmute(vld1q_s64_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19615,44 +18551,14 @@ pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { - let mut ret_val: poly8x16x4_t = transmute(vld1q_s8_x4(transmute(a))); - ret_val.0 = unsafe { - simd_shuffle!( - ret_val.0, - ret_val.0, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.1 = unsafe { - simd_shuffle!( - ret_val.1, - ret_val.1, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 
2, 1, 0] - ) - }; - ret_val.2 = unsafe { - simd_shuffle!( - ret_val.2, - ret_val.2, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val.3 = unsafe { - simd_shuffle!( - ret_val.3, - ret_val.3, - [15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0] - ) - }; - ret_val +pub unsafe fn vld1_p8_x2(a: *const p8) -> poly8x8x2_t { + transmute(vld1_s8_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19668,15 +18574,14 @@ pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { - transmute(vld1_s16_x2(transmute(a))) +pub unsafe fn vld1_p8_x3(a: *const p8) -> poly8x8x3_t { + transmute(vld1_s8_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p8_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19692,18 +18597,14 @@ pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { - let mut ret_val: poly16x4x2_t = transmute(vld1_s16_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_p8_x4(a: *const p8) -> poly8x8x4_t { + transmute(vld1_s8_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19719,15 +18620,14 @@ pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { - transmute(vld1_s16_x3(transmute(a))) +pub unsafe fn vld1q_p8_x2(a: *const p8) -> poly8x16x2_t { + transmute(vld1q_s8_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four 
registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19743,19 +18643,14 @@ pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { - let mut ret_val: poly16x4x3_t = transmute(vld1_s16_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1q_p8_x3(a: *const p8) -> poly8x16x3_t { + transmute(vld1q_s8_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p8_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19771,15 +18666,14 @@ pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { - transmute(vld1_s16_x4(transmute(a))) +pub unsafe fn vld1q_p8_x4(a: *const p8) -> poly8x16x4_t { + transmute(vld1q_s8_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x4)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19795,20 +18689,14 @@ pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { - let mut ret_val: poly16x4x4_t = transmute(vld1_s16_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_p16_x2(a: *const p16) -> poly16x4x2_t { + transmute(vld1_s16_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's 
documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19824,15 +18712,14 @@ pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { - transmute(vld1q_s16_x2(transmute(a))) +pub unsafe fn vld1_p16_x3(a: *const p16) -> poly16x4x3_t { + transmute(vld1_s16_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x2)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1_p16_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19848,18 +18735,14 @@ pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { - let mut ret_val: poly16x8x2_t = transmute(vld1q_s16_x2(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val +pub unsafe fn vld1_p16_x4(a: *const p16) -> poly16x4x4_t { + transmute(vld1_s16_x4(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x3)"] +#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x2)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19875,15 +18758,14 @@ pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { target_arch = "arm", unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] -pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { - transmute(vld1q_s16_x3(transmute(a))) +pub unsafe fn vld1q_p16_x2(a: *const p16) -> poly16x8x2_t { + transmute(vld1q_s16_x2(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x3)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "big")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19900,18 +18782,13 @@ pub unsafe fn 
vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") )] pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { - let mut ret_val: poly16x8x3_t = transmute(vld1q_s16_x3(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val + transmute(vld1q_s16_x3(transmute(a))) } #[doc = "Load multiple single-element structures to one, two, three, or four registers"] #[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x4)"] #[doc = "## Safety"] #[doc = " * Neon intrinsic unsafe"] #[inline(always)] -#[cfg(target_endian = "little")] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] @@ -19930,35 +18807,6 @@ pub unsafe fn vld1q_p16_x3(a: *const p16) -> poly16x8x3_t { pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t { transmute(vld1q_s16_x4(transmute(a))) } -#[doc = "Load multiple single-element structures to one, two, three, or four registers"] -#[doc = "[Arm's documentation](https://developer.arm.com/architectures/instruction-sets/intrinsics/vld1q_p16_x4)"] -#[doc = "## Safety"] -#[doc = " * Neon intrinsic unsafe"] -#[inline(always)] -#[cfg(target_endian = "big")] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld1))] -#[cfg_attr( - all(test, any(target_arch = "aarch64", target_arch = "arm64ec")), - assert_instr(ld1) -)] -#[cfg_attr( - not(target_arch = "arm"), - stable(feature = "neon_intrinsics", since = "1.59.0") -)] -#[cfg_attr( - target_arch = "arm", - unstable(feature = "stdarch_arm_neon_intrinsics", issue = "111800") -)] -pub unsafe fn vld1q_p16_x4(a: *const p16) -> poly16x8x4_t { - let mut ret_val: poly16x8x4_t = transmute(vld1q_s16_x4(transmute(a))); - ret_val.0 = unsafe { simd_shuffle!(ret_val.0, ret_val.0, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.1 = unsafe { simd_shuffle!(ret_val.1, ret_val.1, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.2 = unsafe { simd_shuffle!(ret_val.2, ret_val.2, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val.3 = unsafe { simd_shuffle!(ret_val.3, ret_val.3, [7, 6, 5, 4, 3, 2, 1, 0]) }; - ret_val -} #[inline(always)] #[rustc_legacy_const_generics(1)] #[cfg(target_arch = "arm")] diff --git a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml index 52748a4cc0..3ec7ba8814 100644 --- a/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml +++ b/crates/stdarch-gen-arm/spec/neon/arm_shared.spec.yml @@ -2681,6 +2681,7 @@ intrinsics: - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] - *neon-not-arm-stable - *neon-cfg-arm-unstable + big_endian_inverse: false safety: unsafe: [neon] types: @@ -2740,6 +2741,7 @@ intrinsics: - FnCall: [cfg_attr, [*neon-target-aarch64-arm64ec, {FnCall: [assert_instr, [ld1]]}]] - *neon-not-arm-stable - *neon-cfg-arm-unstable + big_endian_inverse: false safety: unsafe: [neon] types: From fe9aa9cc5a39291aff9c86c24bff1eba8a1b668b Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sun, 1 Feb 2026 13:50:20 +0100 Subject: [PATCH 3/4] use macro for wide store/load roundtrip tests --- 
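Note for reviewers: the store/load roundtrip tests below all share the same shape, so this patch folds them into a local `wide_store_load_roundtrip!` macro. For illustration only (the `test_vld1q_s32_x2_expanded` name is invented here for the sketch; the other identifiers are taken from the diff below), a converted test and roughly what its macro call expands to look like this:

    #[simd_test(enable = "neon")]
    unsafe fn test_vld1q_s32_x2() {
        wide_store_load_roundtrip!(i32, 8, int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2);
    }

    // Roughly the expansion of the macro call above: build a known element
    // array, reinterpret it as the multi-vector tuple, store it with the
    // vst1* intrinsic, load it back with the vld1* intrinsic, and check that
    // the flat element order is preserved.
    #[simd_test(enable = "neon")]
    unsafe fn test_vld1q_s32_x2_expanded() {
        let vals: [i32; 8] = crate::array::from_fn(|i| i as i32);
        let a: int32x4x2_t = transmute(vals);
        let mut tmp = [0 as i32; 8];
        vst1q_s32_x2(tmp.as_mut_ptr().cast(), a);
        let r: int32x4x2_t = vld1q_s32_x2(tmp.as_ptr().cast());
        let out: [i32; 8] = transmute(r);
        assert_eq!(out, vals);
    }

This keeps each test to a single line while still exercising every store/load pair with distinct element values.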
crates/core_arch/src/aarch64/neon/mod.rs | 636 ++++------------------- 1 file changed, 90 insertions(+), 546 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs index feaf94a7f9..ee27bef973 100644 --- a/crates/core_arch/src/aarch64/neon/mod.rs +++ b/crates/core_arch/src/aarch64/neon/mod.rs @@ -994,868 +994,412 @@ mod tests { assert_eq!(vals[2], 2.); } + macro_rules! wide_store_load_roundtrip { + ($elem_ty:ty, $len:expr, $vec_ty:ty, $store:expr, $load:expr) => { + let vals: [$elem_ty; $len] = crate::array::from_fn(|i| i as $elem_ty); + let a: $vec_ty = transmute(vals); + let mut tmp = [0 as $elem_ty; $len]; + $store(tmp.as_mut_ptr().cast(), a); + let r: $vec_ty = $load(tmp.as_ptr().cast()); + let out: [$elem_ty; $len] = transmute(r); + assert_eq!(out, vals); + }; + } + #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1_f16_x2() { - let vals: [f16; 8] = crate::array::from_fn(|i| i as f16); - let a: float16x4x2_t = transmute(vals); - let mut tmp = [0_f16; 8]; - vst1_f16_x2(tmp.as_mut_ptr().cast(), a); - let r: float16x4x2_t = vld1_f16_x2(tmp.as_ptr().cast()); - let out: [f16; 8] = transmute(r); - assert_eq!(out, vals); + wide_store_load_roundtrip!(f16, 8, float16x4x2_t, vst1_f16_x2, vld1_f16_x2); } #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1_f16_x3() { - let vals: [f16; 12] = crate::array::from_fn(|i| i as f16); - let a: float16x4x3_t = transmute(vals); - let mut tmp = [0_f16; 12]; - vst1_f16_x3(tmp.as_mut_ptr().cast(), a); - let r: float16x4x3_t = vld1_f16_x3(tmp.as_ptr().cast()); - let out: [f16; 12] = transmute(r); - assert_eq!(out, vals); + wide_store_load_roundtrip!(f16, 12, float16x4x3_t, vst1_f16_x3, vld1_f16_x3); } #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1_f16_x4() { - let vals: [f16; 16] = crate::array::from_fn(|i| i as f16); - let a: float16x4x4_t = transmute(vals); - let mut tmp = [0_f16; 16]; - vst1_f16_x4(tmp.as_mut_ptr().cast(), a); - let r: float16x4x4_t = vld1_f16_x4(tmp.as_ptr().cast()); - let out: [f16; 16] = transmute(r); - assert_eq!(out, vals); + wide_store_load_roundtrip!(f16, 16, float16x4x4_t, vst1_f16_x4, vld1_f16_x4); } #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1q_f16_x2() { - let vals: [f16; 16] = crate::array::from_fn(|i| i as f16); - let a: float16x8x2_t = transmute(vals); - let mut tmp = [0_f16; 16]; - vst1q_f16_x2(tmp.as_mut_ptr().cast(), a); - let r: float16x8x2_t = vld1q_f16_x2(tmp.as_ptr().cast()); - let out: [f16; 16] = transmute(r); - assert_eq!(out, vals); + wide_store_load_roundtrip!(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2); } #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1q_f16_x3() { - let vals: [f16; 24] = crate::array::from_fn(|i| i as f16); - let a: float16x8x3_t = transmute(vals); - let mut tmp = [0_f16; 24]; - vst1q_f16_x3(tmp.as_mut_ptr().cast(), a); - let r: float16x8x3_t = vld1q_f16_x3(tmp.as_ptr().cast()); - let out: [f16; 24] = transmute(r); - assert_eq!(out, vals); + wide_store_load_roundtrip!(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3); } #[simd_test(enable = "neon,fp16")] #[cfg(not(target_arch = "arm64ec"))] unsafe fn test_vld1q_f16_x4() { - let vals: [f16; 32] = crate::array::from_fn(|i| i as f16); - let a: float16x8x4_t = transmute(vals); - let mut tmp = [0_f16; 32]; - vst1q_f16_x4(tmp.as_mut_ptr().cast(), a); - let r: float16x8x4_t = 
vld1q_f16_x4(tmp.as_ptr().cast());
-        let out: [f16; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_f32_x2() {
-        let vals: [f32; 4] = crate::array::from_fn(|i| i as f32);
-        let a: float32x2x2_t = transmute(vals);
-        let mut tmp = [0_f32; 4];
-        vst1_f32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: float32x2x2_t = vld1_f32_x2(tmp.as_ptr().cast());
-        let out: [f32; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 4, float32x2x2_t, vst1_f32_x2, vld1_f32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_f32_x3() {
-        let vals: [f32; 6] = crate::array::from_fn(|i| i as f32);
-        let a: float32x2x3_t = transmute(vals);
-        let mut tmp = [0_f32; 6];
-        vst1_f32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: float32x2x3_t = vld1_f32_x3(tmp.as_ptr().cast());
-        let out: [f32; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 6, float32x2x3_t, vst1_f32_x3, vld1_f32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_f32_x4() {
-        let vals: [f32; 8] = crate::array::from_fn(|i| i as f32);
-        let a: float32x2x4_t = transmute(vals);
-        let mut tmp = [0_f32; 8];
-        vst1_f32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: float32x2x4_t = vld1_f32_x4(tmp.as_ptr().cast());
-        let out: [f32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 8, float32x2x4_t, vst1_f32_x4, vld1_f32_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_f32_x2() {
-        let vals: [f32; 8] = crate::array::from_fn(|i| i as f32);
-        let a: float32x4x2_t = transmute(vals);
-        let mut tmp = [0_f32; 8];
-        vst1q_f32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: float32x4x2_t = vld1q_f32_x2(tmp.as_ptr().cast());
-        let out: [f32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 8, float32x4x2_t, vst1q_f32_x2, vld1q_f32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_f32_x3() {
-        let vals: [f32; 12] = crate::array::from_fn(|i| i as f32);
-        let a: float32x4x3_t = transmute(vals);
-        let mut tmp = [0_f32; 12];
-        vst1q_f32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: float32x4x3_t = vld1q_f32_x3(tmp.as_ptr().cast());
-        let out: [f32; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 12, float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_f32_x4() {
-        let vals: [f32; 16] = crate::array::from_fn(|i| i as f32);
-        let a: float32x4x4_t = transmute(vals);
-        let mut tmp = [0_f32; 16];
-        vst1q_f32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: float32x4x4_t = vld1q_f32_x4(tmp.as_ptr().cast());
-        let out: [f32; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(f32, 16, float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1_p64_x2() {
-        let vals: [p64; 2] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x1x2_t = transmute(vals);
-        let mut tmp = [0 as p64; 2];
-        vst1_p64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x1x2_t = vld1_p64_x2(tmp.as_ptr().cast());
-        let out: [p64; 2] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 2, poly64x1x2_t, vst1_p64_x2, vld1_p64_x2);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1_p64_x3() {
-        let vals: [p64; 3] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x1x3_t = transmute(vals);
-        let mut tmp = [0 as p64; 3];
-        vst1_p64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x1x3_t = vld1_p64_x3(tmp.as_ptr().cast());
-        let out: [p64; 3] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 3, poly64x1x3_t, vst1_p64_x3, vld1_p64_x3);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1_p64_x4() {
-        let vals: [p64; 4] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x1x4_t = transmute(vals);
-        let mut tmp = [0 as p64; 4];
-        vst1_p64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x1x4_t = vld1_p64_x4(tmp.as_ptr().cast());
-        let out: [p64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 4, poly64x1x4_t, vst1_p64_x4, vld1_p64_x4);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1q_p64_x2() {
-        let vals: [p64; 4] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x2x2_t = transmute(vals);
-        let mut tmp = [0 as p64; 4];
-        vst1q_p64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x2x2_t = vld1q_p64_x2(tmp.as_ptr().cast());
-        let out: [p64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 4, poly64x2x2_t, vst1q_p64_x2, vld1q_p64_x2);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1q_p64_x3() {
-        let vals: [p64; 6] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x2x3_t = transmute(vals);
-        let mut tmp = [0 as p64; 6];
-        vst1q_p64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x2x3_t = vld1q_p64_x3(tmp.as_ptr().cast());
-        let out: [p64; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 6, poly64x2x3_t, vst1q_p64_x3, vld1q_p64_x3);
     }

     #[simd_test(enable = "neon,aes")]
     unsafe fn test_vld1q_p64_x4() {
-        let vals: [p64; 8] = crate::array::from_fn(|i| i as p64);
-        let a: poly64x2x4_t = transmute(vals);
-        let mut tmp = [0 as p64; 8];
-        vst1q_p64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly64x2x4_t = vld1q_p64_x4(tmp.as_ptr().cast());
-        let out: [p64; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p64, 8, poly64x2x4_t, vst1q_p64_x4, vld1q_p64_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s8_x2() {
-        let vals: [i8; 16] = crate::array::from_fn(|i| i as i8);
-        let a: int8x8x2_t = transmute(vals);
-        let mut tmp = [0_i8; 16];
-        vst1_s8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int8x8x2_t = vld1_s8_x2(tmp.as_ptr().cast());
-        let out: [i8; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 16, int8x8x2_t, vst1_s8_x2, vld1_s8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s8_x3() {
-        let vals: [i8; 24] = crate::array::from_fn(|i| i as i8);
-        let a: int8x8x3_t = transmute(vals);
-        let mut tmp = [0_i8; 24];
-        vst1_s8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int8x8x3_t = vld1_s8_x3(tmp.as_ptr().cast());
-        let out: [i8; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 24, int8x8x3_t, vst1_s8_x3, vld1_s8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s8_x4() {
-        let vals: [i8; 32] = crate::array::from_fn(|i| i as i8);
-        let a: int8x8x4_t = transmute(vals);
-        let mut tmp = [0_i8; 32];
-        vst1_s8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int8x8x4_t = vld1_s8_x4(tmp.as_ptr().cast());
-        let out: [i8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 32, int8x8x4_t, vst1_s8_x4, vld1_s8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s8_x2() {
-        let vals: [i8; 32] = crate::array::from_fn(|i| i as i8);
-        let a: int8x16x2_t = transmute(vals);
-        let mut tmp = [0_i8; 32];
-        vst1q_s8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int8x16x2_t = vld1q_s8_x2(tmp.as_ptr().cast());
-        let out: [i8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 32, int8x16x2_t, vst1q_s8_x2, vld1q_s8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s8_x3() {
-        let vals: [i8; 48] = crate::array::from_fn(|i| i as i8);
-        let a: int8x16x3_t = transmute(vals);
-        let mut tmp = [0_i8; 48];
-        vst1q_s8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int8x16x3_t = vld1q_s8_x3(tmp.as_ptr().cast());
-        let out: [i8; 48] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 48, int8x16x3_t, vst1q_s8_x3, vld1q_s8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s8_x4() {
-        let vals: [i8; 64] = crate::array::from_fn(|i| i as i8);
-        let a: int8x16x4_t = transmute(vals);
-        let mut tmp = [0_i8; 64];
-        vst1q_s8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int8x16x4_t = vld1q_s8_x4(tmp.as_ptr().cast());
-        let out: [i8; 64] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i8, 64, int8x16x4_t, vst1q_s8_x4, vld1q_s8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s16_x2() {
-        let vals: [i16; 8] = crate::array::from_fn(|i| i as i16);
-        let a: int16x4x2_t = transmute(vals);
-        let mut tmp = [0_i16; 8];
-        vst1_s16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int16x4x2_t = vld1_s16_x2(tmp.as_ptr().cast());
-        let out: [i16; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 8, int16x4x2_t, vst1_s16_x2, vld1_s16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s16_x3() {
-        let vals: [i16; 12] = crate::array::from_fn(|i| i as i16);
-        let a: int16x4x3_t = transmute(vals);
-        let mut tmp = [0_i16; 12];
-        vst1_s16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int16x4x3_t = vld1_s16_x3(tmp.as_ptr().cast());
-        let out: [i16; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 12, int16x4x3_t, vst1_s16_x3, vld1_s16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s16_x4() {
-        let vals: [i16; 16] = crate::array::from_fn(|i| i as i16);
-        let a: int16x4x4_t = transmute(vals);
-        let mut tmp = [0_i16; 16];
-        vst1_s16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int16x4x4_t = vld1_s16_x4(tmp.as_ptr().cast());
-        let out: [i16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 16, int16x4x4_t, vst1_s16_x4, vld1_s16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s16_x2() {
-        let vals: [i16; 16] = crate::array::from_fn(|i| i as i16);
-        let a: int16x8x2_t = transmute(vals);
-        let mut tmp = [0_i16; 16];
-        vst1q_s16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int16x8x2_t = vld1q_s16_x2(tmp.as_ptr().cast());
-        let out: [i16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 16, int16x8x2_t, vst1q_s16_x2, vld1q_s16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s16_x3() {
-        let vals: [i16; 24] = crate::array::from_fn(|i| i as i16);
-        let a: int16x8x3_t = transmute(vals);
-        let mut tmp = [0_i16; 24];
-        vst1q_s16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int16x8x3_t = vld1q_s16_x3(tmp.as_ptr().cast());
-        let out: [i16; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 24, int16x8x3_t, vst1q_s16_x3, vld1q_s16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s16_x4() {
-        let vals: [i16; 32] = crate::array::from_fn(|i| i as i16);
-        let a: int16x8x4_t = transmute(vals);
-        let mut tmp = [0_i16; 32];
-        vst1q_s16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int16x8x4_t = vld1q_s16_x4(tmp.as_ptr().cast());
-        let out: [i16; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i16, 32, int16x8x4_t, vst1q_s16_x4, vld1q_s16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s32_x2() {
-        let vals: [i32; 4] = crate::array::from_fn(|i| i as i32);
-        let a: int32x2x2_t = transmute(vals);
-        let mut tmp = [0_i32; 4];
-        vst1_s32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int32x2x2_t = vld1_s32_x2(tmp.as_ptr().cast());
-        let out: [i32; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 4, int32x2x2_t, vst1_s32_x2, vld1_s32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s32_x3() {
-        let vals: [i32; 6] = crate::array::from_fn(|i| i as i32);
-        let a: int32x2x3_t = transmute(vals);
-        let mut tmp = [0_i32; 6];
-        vst1_s32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int32x2x3_t = vld1_s32_x3(tmp.as_ptr().cast());
-        let out: [i32; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 6, int32x2x3_t, vst1_s32_x3, vld1_s32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s32_x4() {
-        let vals: [i32; 8] = crate::array::from_fn(|i| i as i32);
-        let a: int32x2x4_t = transmute(vals);
-        let mut tmp = [0_i32; 8];
-        vst1_s32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int32x2x4_t = vld1_s32_x4(tmp.as_ptr().cast());
-        let out: [i32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 8, int32x2x4_t, vst1_s32_x4, vld1_s32_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s32_x2() {
-        let vals: [i32; 8] = crate::array::from_fn(|i| i as i32);
-        let a: int32x4x2_t = transmute(vals);
-        let mut tmp = [0_i32; 8];
-        vst1q_s32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int32x4x2_t = vld1q_s32_x2(tmp.as_ptr().cast());
-        let out: [i32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 8, int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s32_x3() {
-        let vals: [i32; 12] = crate::array::from_fn(|i| i as i32);
-        let a: int32x4x3_t = transmute(vals);
-        let mut tmp = [0_i32; 12];
-        vst1q_s32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int32x4x3_t = vld1q_s32_x3(tmp.as_ptr().cast());
-        let out: [i32; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 12, int32x4x3_t, vst1q_s32_x3, vld1q_s32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s32_x4() {
-        let vals: [i32; 16] = crate::array::from_fn(|i| i as i32);
-        let a: int32x4x4_t = transmute(vals);
-        let mut tmp = [0_i32; 16];
-        vst1q_s32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int32x4x4_t = vld1q_s32_x4(tmp.as_ptr().cast());
-        let out: [i32; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i32, 16, int32x4x4_t, vst1q_s32_x4, vld1q_s32_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s64_x2() {
-        let vals: [i64; 2] = crate::array::from_fn(|i| i as i64);
-        let a: int64x1x2_t = transmute(vals);
-        let mut tmp = [0_i64; 2];
-        vst1_s64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int64x1x2_t = vld1_s64_x2(tmp.as_ptr().cast());
-        let out: [i64; 2] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 2, int64x1x2_t, vst1_s64_x2, vld1_s64_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s64_x3() {
-        let vals: [i64; 3] = crate::array::from_fn(|i| i as i64);
-        let a: int64x1x3_t = transmute(vals);
-        let mut tmp = [0_i64; 3];
-        vst1_s64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int64x1x3_t = vld1_s64_x3(tmp.as_ptr().cast());
-        let out: [i64; 3] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 3, int64x1x3_t, vst1_s64_x3, vld1_s64_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_s64_x4() {
-        let vals: [i64; 4] = crate::array::from_fn(|i| i as i64);
-        let a: int64x1x4_t = transmute(vals);
-        let mut tmp = [0_i64; 4];
-        vst1_s64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int64x1x4_t = vld1_s64_x4(tmp.as_ptr().cast());
-        let out: [i64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 4, int64x1x4_t, vst1_s64_x4, vld1_s64_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s64_x2() {
-        let vals: [i64; 4] = crate::array::from_fn(|i| i as i64);
-        let a: int64x2x2_t = transmute(vals);
-        let mut tmp = [0_i64; 4];
-        vst1q_s64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: int64x2x2_t = vld1q_s64_x2(tmp.as_ptr().cast());
-        let out: [i64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 4, int64x2x2_t, vst1q_s64_x2, vld1q_s64_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s64_x3() {
-        let vals: [i64; 6] = crate::array::from_fn(|i| i as i64);
-        let a: int64x2x3_t = transmute(vals);
-        let mut tmp = [0_i64; 6];
-        vst1q_s64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: int64x2x3_t = vld1q_s64_x3(tmp.as_ptr().cast());
-        let out: [i64; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 6, int64x2x3_t, vst1q_s64_x3, vld1q_s64_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_s64_x4() {
-        let vals: [i64; 8] = crate::array::from_fn(|i| i as i64);
-        let a: int64x2x4_t = transmute(vals);
-        let mut tmp = [0_i64; 8];
-        vst1q_s64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: int64x2x4_t = vld1q_s64_x4(tmp.as_ptr().cast());
-        let out: [i64; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(i64, 8, int64x2x4_t, vst1q_s64_x4, vld1q_s64_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u8_x2() {
-        let vals: [u8; 16] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x8x2_t = transmute(vals);
-        let mut tmp = [0_u8; 16];
-        vst1_u8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x8x2_t = vld1_u8_x2(tmp.as_ptr().cast());
-        let out: [u8; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 16, uint8x8x2_t, vst1_u8_x2, vld1_u8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u8_x3() {
-        let vals: [u8; 24] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x8x3_t = transmute(vals);
-        let mut tmp = [0_u8; 24];
-        vst1_u8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x8x3_t = vld1_u8_x3(tmp.as_ptr().cast());
-        let out: [u8; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 24, uint8x8x3_t, vst1_u8_x3, vld1_u8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u8_x4() {
-        let vals: [u8; 32] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x8x4_t = transmute(vals);
-        let mut tmp = [0_u8; 32];
-        vst1_u8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x8x4_t = vld1_u8_x4(tmp.as_ptr().cast());
-        let out: [u8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 32, uint8x8x4_t, vst1_u8_x4, vld1_u8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u8_x2() {
-        let vals: [u8; 32] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x16x2_t = transmute(vals);
-        let mut tmp = [0_u8; 32];
-        vst1q_u8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x16x2_t = vld1q_u8_x2(tmp.as_ptr().cast());
-        let out: [u8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 32, uint8x16x2_t, vst1q_u8_x2, vld1q_u8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u8_x3() {
-        let vals: [u8; 48] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x16x3_t = transmute(vals);
-        let mut tmp = [0_u8; 48];
-        vst1q_u8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x16x3_t = vld1q_u8_x3(tmp.as_ptr().cast());
-        let out: [u8; 48] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 48, uint8x16x3_t, vst1q_u8_x3, vld1q_u8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u8_x4() {
-        let vals: [u8; 64] = crate::array::from_fn(|i| i as u8);
-        let a: uint8x16x4_t = transmute(vals);
-        let mut tmp = [0_u8; 64];
-        vst1q_u8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint8x16x4_t = vld1q_u8_x4(tmp.as_ptr().cast());
-        let out: [u8; 64] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u8, 64, uint8x16x4_t, vst1q_u8_x4, vld1q_u8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u16_x2() {
-        let vals: [u16; 8] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x4x2_t = transmute(vals);
-        let mut tmp = [0_u16; 8];
-        vst1_u16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x4x2_t = vld1_u16_x2(tmp.as_ptr().cast());
-        let out: [u16; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 8, uint16x4x2_t, vst1_u16_x2, vld1_u16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u16_x3() {
-        let vals: [u16; 12] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x4x3_t = transmute(vals);
-        let mut tmp = [0_u16; 12];
-        vst1_u16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x4x3_t = vld1_u16_x3(tmp.as_ptr().cast());
-        let out: [u16; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 12, uint16x4x3_t, vst1_u16_x3, vld1_u16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u16_x4() {
-        let vals: [u16; 16] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x4x4_t = transmute(vals);
-        let mut tmp = [0_u16; 16];
-        vst1_u16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x4x4_t = vld1_u16_x4(tmp.as_ptr().cast());
-        let out: [u16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 16, uint16x4x4_t, vst1_u16_x4, vld1_u16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u16_x2() {
-        let vals: [u16; 16] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x8x2_t = transmute(vals);
-        let mut tmp = [0_u16; 16];
-        vst1q_u16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x8x2_t = vld1q_u16_x2(tmp.as_ptr().cast());
-        let out: [u16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 16, uint16x8x2_t, vst1q_u16_x2, vld1q_u16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u16_x3() {
-        let vals: [u16; 24] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x8x3_t = transmute(vals);
-        let mut tmp = [0_u16; 24];
-        vst1q_u16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x8x3_t = vld1q_u16_x3(tmp.as_ptr().cast());
-        let out: [u16; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 24, uint16x8x3_t, vst1q_u16_x3, vld1q_u16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u16_x4() {
-        let vals: [u16; 32] = crate::array::from_fn(|i| i as u16);
-        let a: uint16x8x4_t = transmute(vals);
-        let mut tmp = [0_u16; 32];
-        vst1q_u16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint16x8x4_t = vld1q_u16_x4(tmp.as_ptr().cast());
-        let out: [u16; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u16, 32, uint16x8x4_t, vst1q_u16_x4, vld1q_u16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u32_x2() {
-        let vals: [u32; 4] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x2x2_t = transmute(vals);
-        let mut tmp = [0_u32; 4];
-        vst1_u32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x2x2_t = vld1_u32_x2(tmp.as_ptr().cast());
-        let out: [u32; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 4, uint32x2x2_t, vst1_u32_x2, vld1_u32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u32_x3() {
-        let vals: [u32; 6] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x2x3_t = transmute(vals);
-        let mut tmp = [0_u32; 6];
-        vst1_u32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x2x3_t = vld1_u32_x3(tmp.as_ptr().cast());
-        let out: [u32; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 6, uint32x2x3_t, vst1_u32_x3, vld1_u32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u32_x4() {
-        let vals: [u32; 8] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x2x4_t = transmute(vals);
-        let mut tmp = [0_u32; 8];
-        vst1_u32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x2x4_t = vld1_u32_x4(tmp.as_ptr().cast());
-        let out: [u32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 8, uint32x2x4_t, vst1_u32_x4, vld1_u32_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u32_x2() {
-        let vals: [u32; 8] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x4x2_t = transmute(vals);
-        let mut tmp = [0_u32; 8];
-        vst1q_u32_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x4x2_t = vld1q_u32_x2(tmp.as_ptr().cast());
-        let out: [u32; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 8, uint32x4x2_t, vst1q_u32_x2, vld1q_u32_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u32_x3() {
-        let vals: [u32; 12] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x4x3_t = transmute(vals);
-        let mut tmp = [0_u32; 12];
-        vst1q_u32_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x4x3_t = vld1q_u32_x3(tmp.as_ptr().cast());
-        let out: [u32; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 12, uint32x4x3_t, vst1q_u32_x3, vld1q_u32_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u32_x4() {
-        let vals: [u32; 16] = crate::array::from_fn(|i| i as u32);
-        let a: uint32x4x4_t = transmute(vals);
-        let mut tmp = [0_u32; 16];
-        vst1q_u32_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint32x4x4_t = vld1q_u32_x4(tmp.as_ptr().cast());
-        let out: [u32; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u32, 16, uint32x4x4_t, vst1q_u32_x4, vld1q_u32_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u64_x2() {
-        let vals: [u64; 2] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x1x2_t = transmute(vals);
-        let mut tmp = [0_u64; 2];
-        vst1_u64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x1x2_t = vld1_u64_x2(tmp.as_ptr().cast());
-        let out: [u64; 2] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 2, uint64x1x2_t, vst1_u64_x2, vld1_u64_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u64_x3() {
-        let vals: [u64; 3] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x1x3_t = transmute(vals);
-        let mut tmp = [0_u64; 3];
-        vst1_u64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x1x3_t = vld1_u64_x3(tmp.as_ptr().cast());
-        let out: [u64; 3] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 3, uint64x1x3_t, vst1_u64_x3, vld1_u64_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_u64_x4() {
-        let vals: [u64; 4] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x1x4_t = transmute(vals);
-        let mut tmp = [0_u64; 4];
-        vst1_u64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x1x4_t = vld1_u64_x4(tmp.as_ptr().cast());
-        let out: [u64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 4, uint64x1x4_t, vst1_u64_x4, vld1_u64_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u64_x2() {
-        let vals: [u64; 4] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x2x2_t = transmute(vals);
-        let mut tmp = [0_u64; 4];
-        vst1q_u64_x2(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x2x2_t = vld1q_u64_x2(tmp.as_ptr().cast());
-        let out: [u64; 4] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 4, uint64x2x2_t, vst1q_u64_x2, vld1q_u64_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u64_x3() {
-        let vals: [u64; 6] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x2x3_t = transmute(vals);
-        let mut tmp = [0_u64; 6];
-        vst1q_u64_x3(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x2x3_t = vld1q_u64_x3(tmp.as_ptr().cast());
-        let out: [u64; 6] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 6, uint64x2x3_t, vst1q_u64_x3, vld1q_u64_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_u64_x4() {
-        let vals: [u64; 8] = crate::array::from_fn(|i| i as u64);
-        let a: uint64x2x4_t = transmute(vals);
-        let mut tmp = [0_u64; 8];
-        vst1q_u64_x4(tmp.as_mut_ptr().cast(), a);
-        let r: uint64x2x4_t = vld1q_u64_x4(tmp.as_ptr().cast());
-        let out: [u64; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(u64, 8, uint64x2x4_t, vst1q_u64_x4, vld1q_u64_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p8_x2() {
-        let vals: [p8; 16] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x8x2_t = transmute(vals);
-        let mut tmp = [0 as p8; 16];
-        vst1_p8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x8x2_t = vld1_p8_x2(tmp.as_ptr().cast());
-        let out: [p8; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 16, poly8x8x2_t, vst1_p8_x2, vld1_p8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p8_x3() {
-        let vals: [p8; 24] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x8x3_t = transmute(vals);
-        let mut tmp = [0 as p8; 24];
-        vst1_p8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x8x3_t = vld1_p8_x3(tmp.as_ptr().cast());
-        let out: [p8; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 24, poly8x8x3_t, vst1_p8_x3, vld1_p8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p8_x4() {
-        let vals: [p8; 32] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x8x4_t = transmute(vals);
-        let mut tmp = [0 as p8; 32];
-        vst1_p8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x8x4_t = vld1_p8_x4(tmp.as_ptr().cast());
-        let out: [p8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 32, poly8x8x4_t, vst1_p8_x4, vld1_p8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p8_x2() {
-        let vals: [p8; 32] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x16x2_t = transmute(vals);
-        let mut tmp = [0 as p8; 32];
-        vst1q_p8_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x16x2_t = vld1q_p8_x2(tmp.as_ptr().cast());
-        let out: [p8; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 32, poly8x16x2_t, vst1q_p8_x2, vld1q_p8_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p8_x3() {
-        let vals: [p8; 48] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x16x3_t = transmute(vals);
-        let mut tmp = [0 as p8; 48];
-        vst1q_p8_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x16x3_t = vld1q_p8_x3(tmp.as_ptr().cast());
-        let out: [p8; 48] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 48, poly8x16x3_t, vst1q_p8_x3, vld1q_p8_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p8_x4() {
-        let vals: [p8; 64] = crate::array::from_fn(|i| i as p8);
-        let a: poly8x16x4_t = transmute(vals);
-        let mut tmp = [0 as p8; 64];
-        vst1q_p8_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly8x16x4_t = vld1q_p8_x4(tmp.as_ptr().cast());
-        let out: [p8; 64] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p8, 64, poly8x16x4_t, vst1q_p8_x4, vld1q_p8_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p16_x2() {
-        let vals: [p16; 8] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x4x2_t = transmute(vals);
-        let mut tmp = [0 as p16; 8];
-        vst1_p16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x4x2_t = vld1_p16_x2(tmp.as_ptr().cast());
-        let out: [p16; 8] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 8, poly16x4x2_t, vst1_p16_x2, vld1_p16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p16_x3() {
-        let vals: [p16; 12] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x4x3_t = transmute(vals);
-        let mut tmp = [0 as p16; 12];
-        vst1_p16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x4x3_t = vld1_p16_x3(tmp.as_ptr().cast());
-        let out: [p16; 12] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 12, poly16x4x3_t, vst1_p16_x3, vld1_p16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1_p16_x4() {
-        let vals: [p16; 16] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x4x4_t = transmute(vals);
-        let mut tmp = [0 as p16; 16];
-        vst1_p16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x4x4_t = vld1_p16_x4(tmp.as_ptr().cast());
-        let out: [p16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 16, poly16x4x4_t, vst1_p16_x4, vld1_p16_x4);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p16_x2() {
-        let vals: [p16; 16] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x8x2_t = transmute(vals);
-        let mut tmp = [0 as p16; 16];
-        vst1q_p16_x2(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x8x2_t = vld1q_p16_x2(tmp.as_ptr().cast());
-        let out: [p16; 16] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 16, poly16x8x2_t, vst1q_p16_x2, vld1q_p16_x2);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p16_x3() {
-        let vals: [p16; 24] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x8x3_t = transmute(vals);
-        let mut tmp = [0 as p16; 24];
-        vst1q_p16_x3(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x8x3_t = vld1q_p16_x3(tmp.as_ptr().cast());
-        let out: [p16; 24] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 24, poly16x8x3_t, vst1q_p16_x3, vld1q_p16_x3);
     }

     #[simd_test(enable = "neon")]
     unsafe fn test_vld1q_p16_x4() {
-        let vals: [p16; 32] = crate::array::from_fn(|i| i as p16);
-        let a: poly16x8x4_t = transmute(vals);
-        let mut tmp = [0 as p16; 32];
-        vst1q_p16_x4(tmp.as_mut_ptr().cast(), a);
-        let r: poly16x8x4_t = vld1q_p16_x4(tmp.as_ptr().cast());
-        let out: [p16; 32] = transmute(r);
-        assert_eq!(out, vals);
+        wide_store_load_roundtrip!(p16, 32, poly16x8x4_t, vst1q_p16_x4, vld1q_p16_x4);
     }
 }

From 5787838c05d0f2805ca9562fdab340f6a33dd15a Mon Sep 17 00:00:00 2001
From: Folkert de Vries
Date: Sun, 1 Feb 2026 14:17:35 +0100
Subject: [PATCH 4/4] use more capable macro for wide store/load roundtrip tests

---
 crates/core_arch/src/aarch64/neon/mod.rs | 470 ++++++-----------------
 1 file changed, 109 insertions(+), 361 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/mod.rs b/crates/core_arch/src/aarch64/neon/mod.rs
index ee27bef973..580f203ef0 100644
--- a/crates/core_arch/src/aarch64/neon/mod.rs
+++ b/crates/core_arch/src/aarch64/neon/mod.rs
@@ -1006,400 +1006,148 @@ mod tests {
         };
     }

-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1_f16_x2() {
-        wide_store_load_roundtrip!(f16, 8, float16x4x2_t, vst1_f16_x2, vld1_f16_x2);
-    }
-
-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1_f16_x3() {
-        wide_store_load_roundtrip!(f16, 12, float16x4x3_t, vst1_f16_x3, vld1_f16_x3);
-    }
-
-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1_f16_x4() {
-        wide_store_load_roundtrip!(f16, 16, float16x4x4_t, vst1_f16_x4, vld1_f16_x4);
-    }
-
-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1q_f16_x2() {
-        wide_store_load_roundtrip!(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2);
-    }
-
-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1q_f16_x3() {
-        wide_store_load_roundtrip!(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
-    }
-
-    #[simd_test(enable = "neon,fp16")]
-    #[cfg(not(target_arch = "arm64ec"))]
-    unsafe fn test_vld1q_f16_x4() {
-        wide_store_load_roundtrip!(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x2() {
-        wide_store_load_roundtrip!(f32, 4, float32x2x2_t, vst1_f32_x2, vld1_f32_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x3() {
-        wide_store_load_roundtrip!(f32, 6, float32x2x3_t, vst1_f32_x3, vld1_f32_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_f32_x4() {
-        wide_store_load_roundtrip!(f32, 8, float32x2x4_t, vst1_f32_x4, vld1_f32_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x2() {
-        wide_store_load_roundtrip!(f32, 8, float32x4x2_t, vst1q_f32_x2, vld1q_f32_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x3() {
-        wide_store_load_roundtrip!(f32, 12, float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_f32_x4() {
-        wide_store_load_roundtrip!(f32, 16, float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1_p64_x2() {
-        wide_store_load_roundtrip!(p64, 2, poly64x1x2_t, vst1_p64_x2, vld1_p64_x2);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1_p64_x3() {
-        wide_store_load_roundtrip!(p64, 3, poly64x1x3_t, vst1_p64_x3, vld1_p64_x3);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1_p64_x4() {
-        wide_store_load_roundtrip!(p64, 4, poly64x1x4_t, vst1_p64_x4, vld1_p64_x4);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1q_p64_x2() {
-        wide_store_load_roundtrip!(p64, 4, poly64x2x2_t, vst1q_p64_x2, vld1q_p64_x2);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1q_p64_x3() {
-        wide_store_load_roundtrip!(p64, 6, poly64x2x3_t, vst1q_p64_x3, vld1q_p64_x3);
-    }
-
-    #[simd_test(enable = "neon,aes")]
-    unsafe fn test_vld1q_p64_x4() {
-        wide_store_load_roundtrip!(p64, 8, poly64x2x4_t, vst1q_p64_x4, vld1q_p64_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s8_x2() {
-        wide_store_load_roundtrip!(i8, 16, int8x8x2_t, vst1_s8_x2, vld1_s8_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s8_x3() {
-        wide_store_load_roundtrip!(i8, 24, int8x8x3_t, vst1_s8_x3, vld1_s8_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s8_x4() {
-        wide_store_load_roundtrip!(i8, 32, int8x8x4_t, vst1_s8_x4, vld1_s8_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s8_x2() {
-        wide_store_load_roundtrip!(i8, 32, int8x16x2_t, vst1q_s8_x2, vld1q_s8_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s8_x3() {
-        wide_store_load_roundtrip!(i8, 48, int8x16x3_t, vst1q_s8_x3, vld1q_s8_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s8_x4() {
-        wide_store_load_roundtrip!(i8, 64, int8x16x4_t, vst1q_s8_x4, vld1q_s8_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s16_x2() {
-        wide_store_load_roundtrip!(i16, 8, int16x4x2_t, vst1_s16_x2, vld1_s16_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s16_x3() {
-        wide_store_load_roundtrip!(i16, 12, int16x4x3_t, vst1_s16_x3, vld1_s16_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s16_x4() {
-        wide_store_load_roundtrip!(i16, 16, int16x4x4_t, vst1_s16_x4, vld1_s16_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s16_x2() {
-        wide_store_load_roundtrip!(i16, 16, int16x8x2_t, vst1q_s16_x2, vld1q_s16_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s16_x3() {
-        wide_store_load_roundtrip!(i16, 24, int16x8x3_t, vst1q_s16_x3, vld1q_s16_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s16_x4() {
-        wide_store_load_roundtrip!(i16, 32, int16x8x4_t, vst1q_s16_x4, vld1q_s16_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s32_x2() {
-        wide_store_load_roundtrip!(i32, 4, int32x2x2_t, vst1_s32_x2, vld1_s32_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s32_x3() {
-        wide_store_load_roundtrip!(i32, 6, int32x2x3_t, vst1_s32_x3, vld1_s32_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s32_x4() {
-        wide_store_load_roundtrip!(i32, 8, int32x2x4_t, vst1_s32_x4, vld1_s32_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s32_x2() {
-        wide_store_load_roundtrip!(i32, 8, int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s32_x3() {
-        wide_store_load_roundtrip!(i32, 12, int32x4x3_t, vst1q_s32_x3, vld1q_s32_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s32_x4() {
-        wide_store_load_roundtrip!(i32, 16, int32x4x4_t, vst1q_s32_x4, vld1q_s32_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s64_x2() {
-        wide_store_load_roundtrip!(i64, 2, int64x1x2_t, vst1_s64_x2, vld1_s64_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s64_x3() {
-        wide_store_load_roundtrip!(i64, 3, int64x1x3_t, vst1_s64_x3, vld1_s64_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_s64_x4() {
-        wide_store_load_roundtrip!(i64, 4, int64x1x4_t, vst1_s64_x4, vld1_s64_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s64_x2() {
-        wide_store_load_roundtrip!(i64, 4, int64x2x2_t, vst1q_s64_x2, vld1q_s64_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s64_x3() {
-        wide_store_load_roundtrip!(i64, 6, int64x2x3_t, vst1q_s64_x3, vld1q_s64_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_s64_x4() {
-        wide_store_load_roundtrip!(i64, 8, int64x2x4_t, vst1q_s64_x4, vld1q_s64_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u8_x2() {
-        wide_store_load_roundtrip!(u8, 16, uint8x8x2_t, vst1_u8_x2, vld1_u8_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u8_x3() {
-        wide_store_load_roundtrip!(u8, 24, uint8x8x3_t, vst1_u8_x3, vld1_u8_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u8_x4() {
-        wide_store_load_roundtrip!(u8, 32, uint8x8x4_t, vst1_u8_x4, vld1_u8_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u8_x2() {
-        wide_store_load_roundtrip!(u8, 32, uint8x16x2_t, vst1q_u8_x2, vld1q_u8_x2);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u8_x3() {
-        wide_store_load_roundtrip!(u8, 48, uint8x16x3_t, vst1q_u8_x3, vld1q_u8_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u8_x4() {
-        wide_store_load_roundtrip!(u8, 64, uint8x16x4_t, vst1q_u8_x4, vld1q_u8_x4);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u16_x2() {
-        wide_store_load_roundtrip!(u16, 8, uint16x4x2_t, vst1_u16_x2, vld1_u16_x2);
+    macro_rules! wide_store_load_roundtrip_fp16 {
+        ($( $name:ident $args:tt);* $(;)?) => {
+            $(
+                #[simd_test(enable = "neon,fp16")]
+                #[cfg(not(target_arch = "arm64ec"))]
+                unsafe fn $name() {
+                    wide_store_load_roundtrip! $args;
+                }
+            )*
+        };
     }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u16_x3() {
-        wide_store_load_roundtrip!(u16, 12, uint16x4x3_t, vst1_u16_x3, vld1_u16_x3);
-    }
+    wide_store_load_roundtrip_fp16! {
+        test_vld1_f16_x2(f16, 8, float16x4x2_t, vst1_f16_x2, vld1_f16_x2);
+        test_vld1_f16_x3(f16, 12, float16x4x3_t, vst1_f16_x3, vld1_f16_x3);
+        test_vld1_f16_x4(f16, 16, float16x4x4_t, vst1_f16_x4, vld1_f16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u16_x4() {
-        wide_store_load_roundtrip!(u16, 16, uint16x4x4_t, vst1_u16_x4, vld1_u16_x4);
+        test_vld1q_f16_x2(f16, 16, float16x8x2_t, vst1q_f16_x2, vld1q_f16_x2);
+        test_vld1q_f16_x3(f16, 24, float16x8x3_t, vst1q_f16_x3, vld1q_f16_x3);
+        test_vld1q_f16_x4(f16, 32, float16x8x4_t, vst1q_f16_x4, vld1q_f16_x4);
     }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u16_x2() {
-        wide_store_load_roundtrip!(u16, 16, uint16x8x2_t, vst1q_u16_x2, vld1q_u16_x2);
+    macro_rules! wide_store_load_roundtrip_aes {
+        ($( $name:ident $args:tt);* $(;)?) => {
+            $(
+                #[simd_test(enable = "neon,aes")]
+                unsafe fn $name() {
+                    wide_store_load_roundtrip! $args;
+                }
+            )*
+        };
     }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u16_x3() {
-        wide_store_load_roundtrip!(u16, 24, uint16x8x3_t, vst1q_u16_x3, vld1q_u16_x3);
-    }
+    wide_store_load_roundtrip_aes! {
+        test_vld1_p64_x2(p64, 2, poly64x1x2_t, vst1_p64_x2, vld1_p64_x2);
+        test_vld1_p64_x3(p64, 3, poly64x1x3_t, vst1_p64_x3, vld1_p64_x3);
+        test_vld1_p64_x4(p64, 4, poly64x1x4_t, vst1_p64_x4, vld1_p64_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u16_x4() {
-        wide_store_load_roundtrip!(u16, 32, uint16x8x4_t, vst1q_u16_x4, vld1q_u16_x4);
+        test_vld1q_p64_x2(p64, 4, poly64x2x2_t, vst1q_p64_x2, vld1q_p64_x2);
+        test_vld1q_p64_x3(p64, 6, poly64x2x3_t, vst1q_p64_x3, vld1q_p64_x3);
+        test_vld1q_p64_x4(p64, 8, poly64x2x4_t, vst1q_p64_x4, vld1q_p64_x4);
     }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u32_x2() {
-        wide_store_load_roundtrip!(u32, 4, uint32x2x2_t, vst1_u32_x2, vld1_u32_x2);
+    macro_rules! wide_store_load_roundtrip_neon {
+        ($( $name:ident $args:tt);* $(;)?) => {
+            $(
+                #[simd_test(enable = "neon")]
+                unsafe fn $name() {
+                    wide_store_load_roundtrip! $args;
+                }
+            )*
+        };
     }

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u32_x3() {
-        wide_store_load_roundtrip!(u32, 6, uint32x2x3_t, vst1_u32_x3, vld1_u32_x3);
-    }
+    wide_store_load_roundtrip_neon! {
+        test_vld1_f32_x2(f32, 4, float32x2x2_t, vst1_f32_x2, vld1_f32_x2);
+        test_vld1_f32_x3(f32, 6, float32x2x3_t, vst1_f32_x3, vld1_f32_x3);
+        test_vld1_f32_x4(f32, 8, float32x2x4_t, vst1_f32_x4, vld1_f32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u32_x4() {
-        wide_store_load_roundtrip!(u32, 8, uint32x2x4_t, vst1_u32_x4, vld1_u32_x4);
-    }
+        test_vld1q_f32_x2(f32, 8, float32x4x2_t, vst1q_f32_x2, vld1q_f32_x2);
+        test_vld1q_f32_x3(f32, 12, float32x4x3_t, vst1q_f32_x3, vld1q_f32_x3);
+        test_vld1q_f32_x4(f32, 16, float32x4x4_t, vst1q_f32_x4, vld1q_f32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u32_x2() {
-        wide_store_load_roundtrip!(u32, 8, uint32x4x2_t, vst1q_u32_x2, vld1q_u32_x2);
-    }
+        test_vld1_s8_x2(i8, 16, int8x8x2_t, vst1_s8_x2, vld1_s8_x2);
+        test_vld1_s8_x3(i8, 24, int8x8x3_t, vst1_s8_x3, vld1_s8_x3);
+        test_vld1_s8_x4(i8, 32, int8x8x4_t, vst1_s8_x4, vld1_s8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u32_x3() {
-        wide_store_load_roundtrip!(u32, 12, uint32x4x3_t, vst1q_u32_x3, vld1q_u32_x3);
-    }
+        test_vld1q_s8_x2(i8, 32, int8x16x2_t, vst1q_s8_x2, vld1q_s8_x2);
+        test_vld1q_s8_x3(i8, 48, int8x16x3_t, vst1q_s8_x3, vld1q_s8_x3);
+        test_vld1q_s8_x4(i8, 64, int8x16x4_t, vst1q_s8_x4, vld1q_s8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u32_x4() {
-        wide_store_load_roundtrip!(u32, 16, uint32x4x4_t, vst1q_u32_x4, vld1q_u32_x4);
-    }
+        test_vld1_s16_x2(i16, 8, int16x4x2_t, vst1_s16_x2, vld1_s16_x2);
+        test_vld1_s16_x3(i16, 12, int16x4x3_t, vst1_s16_x3, vld1_s16_x3);
+        test_vld1_s16_x4(i16, 16, int16x4x4_t, vst1_s16_x4, vld1_s16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u64_x2() {
-        wide_store_load_roundtrip!(u64, 2, uint64x1x2_t, vst1_u64_x2, vld1_u64_x2);
-    }
+        test_vld1q_s16_x2(i16, 16, int16x8x2_t, vst1q_s16_x2, vld1q_s16_x2);
+        test_vld1q_s16_x3(i16, 24, int16x8x3_t, vst1q_s16_x3, vld1q_s16_x3);
+        test_vld1q_s16_x4(i16, 32, int16x8x4_t, vst1q_s16_x4, vld1q_s16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u64_x3() {
-        wide_store_load_roundtrip!(u64, 3, uint64x1x3_t, vst1_u64_x3, vld1_u64_x3);
-    }
+        test_vld1_s32_x2(i32, 4, int32x2x2_t, vst1_s32_x2, vld1_s32_x2);
+        test_vld1_s32_x3(i32, 6, int32x2x3_t, vst1_s32_x3, vld1_s32_x3);
+        test_vld1_s32_x4(i32, 8, int32x2x4_t, vst1_s32_x4, vld1_s32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_u64_x4() {
-        wide_store_load_roundtrip!(u64, 4, uint64x1x4_t, vst1_u64_x4, vld1_u64_x4);
-    }
+        test_vld1q_s32_x2(i32, 8, int32x4x2_t, vst1q_s32_x2, vld1q_s32_x2);
+        test_vld1q_s32_x3(i32, 12, int32x4x3_t, vst1q_s32_x3, vld1q_s32_x3);
+        test_vld1q_s32_x4(i32, 16, int32x4x4_t, vst1q_s32_x4, vld1q_s32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u64_x2() {
-        wide_store_load_roundtrip!(u64, 4, uint64x2x2_t, vst1q_u64_x2, vld1q_u64_x2);
-    }
+        test_vld1_s64_x2(i64, 2, int64x1x2_t, vst1_s64_x2, vld1_s64_x2);
+        test_vld1_s64_x3(i64, 3, int64x1x3_t, vst1_s64_x3, vld1_s64_x3);
+        test_vld1_s64_x4(i64, 4, int64x1x4_t, vst1_s64_x4, vld1_s64_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u64_x3() {
-        wide_store_load_roundtrip!(u64, 6, uint64x2x3_t, vst1q_u64_x3, vld1q_u64_x3);
-    }
+        test_vld1q_s64_x2(i64, 4, int64x2x2_t, vst1q_s64_x2, vld1q_s64_x2);
+        test_vld1q_s64_x3(i64, 6, int64x2x3_t, vst1q_s64_x3, vld1q_s64_x3);
+        test_vld1q_s64_x4(i64, 8, int64x2x4_t, vst1q_s64_x4, vld1q_s64_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_u64_x4() {
-        wide_store_load_roundtrip!(u64, 8, uint64x2x4_t, vst1q_u64_x4, vld1q_u64_x4);
-    }
+        test_vld1_u8_x2(u8, 16, uint8x8x2_t, vst1_u8_x2, vld1_u8_x2);
+        test_vld1_u8_x3(u8, 24, uint8x8x3_t, vst1_u8_x3, vld1_u8_x3);
+        test_vld1_u8_x4(u8, 32, uint8x8x4_t, vst1_u8_x4, vld1_u8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x2() {
-        wide_store_load_roundtrip!(p8, 16, poly8x8x2_t, vst1_p8_x2, vld1_p8_x2);
-    }
+        test_vld1q_u8_x2(u8, 32, uint8x16x2_t, vst1q_u8_x2, vld1q_u8_x2);
+        test_vld1q_u8_x3(u8, 48, uint8x16x3_t, vst1q_u8_x3, vld1q_u8_x3);
+        test_vld1q_u8_x4(u8, 64, uint8x16x4_t, vst1q_u8_x4, vld1q_u8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x3() {
-        wide_store_load_roundtrip!(p8, 24, poly8x8x3_t, vst1_p8_x3, vld1_p8_x3);
-    }
+        test_vld1_u16_x2(u16, 8, uint16x4x2_t, vst1_u16_x2, vld1_u16_x2);
+        test_vld1_u16_x3(u16, 12, uint16x4x3_t, vst1_u16_x3, vld1_u16_x3);
+        test_vld1_u16_x4(u16, 16, uint16x4x4_t, vst1_u16_x4, vld1_u16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p8_x4() {
-        wide_store_load_roundtrip!(p8, 32, poly8x8x4_t, vst1_p8_x4, vld1_p8_x4);
-    }
+        test_vld1q_u16_x2(u16, 16, uint16x8x2_t, vst1q_u16_x2, vld1q_u16_x2);
+        test_vld1q_u16_x3(u16, 24, uint16x8x3_t, vst1q_u16_x3, vld1q_u16_x3);
+        test_vld1q_u16_x4(u16, 32, uint16x8x4_t, vst1q_u16_x4, vld1q_u16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x2() {
-        wide_store_load_roundtrip!(p8, 32, poly8x16x2_t, vst1q_p8_x2, vld1q_p8_x2);
-    }
+        test_vld1_u32_x2(u32, 4, uint32x2x2_t, vst1_u32_x2, vld1_u32_x2);
+        test_vld1_u32_x3(u32, 6, uint32x2x3_t, vst1_u32_x3, vld1_u32_x3);
+        test_vld1_u32_x4(u32, 8, uint32x2x4_t, vst1_u32_x4, vld1_u32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x3() {
-        wide_store_load_roundtrip!(p8, 48, poly8x16x3_t, vst1q_p8_x3, vld1q_p8_x3);
-    }
+        test_vld1q_u32_x2(u32, 8, uint32x4x2_t, vst1q_u32_x2, vld1q_u32_x2);
+        test_vld1q_u32_x3(u32, 12, uint32x4x3_t, vst1q_u32_x3, vld1q_u32_x3);
+        test_vld1q_u32_x4(u32, 16, uint32x4x4_t, vst1q_u32_x4, vld1q_u32_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p8_x4() {
-        wide_store_load_roundtrip!(p8, 64, poly8x16x4_t, vst1q_p8_x4, vld1q_p8_x4);
-    }
+        test_vld1_u64_x2(u64, 2, uint64x1x2_t, vst1_u64_x2, vld1_u64_x2);
+        test_vld1_u64_x3(u64, 3, uint64x1x3_t, vst1_u64_x3, vld1_u64_x3);
+        test_vld1_u64_x4(u64, 4, uint64x1x4_t, vst1_u64_x4, vld1_u64_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x2() {
-        wide_store_load_roundtrip!(p16, 8, poly16x4x2_t, vst1_p16_x2, vld1_p16_x2);
-    }
+        test_vld1q_u64_x2(u64, 4, uint64x2x2_t, vst1q_u64_x2, vld1q_u64_x2);
+        test_vld1q_u64_x3(u64, 6, uint64x2x3_t, vst1q_u64_x3, vld1q_u64_x3);
+        test_vld1q_u64_x4(u64, 8, uint64x2x4_t, vst1q_u64_x4, vld1q_u64_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x3() {
-        wide_store_load_roundtrip!(p16, 12, poly16x4x3_t, vst1_p16_x3, vld1_p16_x3);
-    }
+        test_vld1_p8_x2(p8, 16, poly8x8x2_t, vst1_p8_x2, vld1_p8_x2);
+        test_vld1_p8_x3(p8, 24, poly8x8x3_t, vst1_p8_x3, vld1_p8_x3);
+        test_vld1_p8_x4(p8, 32, poly8x8x4_t, vst1_p8_x4, vld1_p8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1_p16_x4() {
-        wide_store_load_roundtrip!(p16, 16, poly16x4x4_t, vst1_p16_x4, vld1_p16_x4);
-    }
+        test_vld1q_p8_x2(p8, 32, poly8x16x2_t, vst1q_p8_x2, vld1q_p8_x2);
+        test_vld1q_p8_x3(p8, 48, poly8x16x3_t, vst1q_p8_x3, vld1q_p8_x3);
+        test_vld1q_p8_x4(p8, 64, poly8x16x4_t, vst1q_p8_x4, vld1q_p8_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x2() {
-        wide_store_load_roundtrip!(p16, 16, poly16x8x2_t, vst1q_p16_x2, vld1q_p16_x2);
-    }
+        test_vld1_p16_x2(p16, 8, poly16x4x2_t, vst1_p16_x2, vld1_p16_x2);
+        test_vld1_p16_x3(p16, 12, poly16x4x3_t, vst1_p16_x3, vld1_p16_x3);
+        test_vld1_p16_x4(p16, 16, poly16x4x4_t, vst1_p16_x4, vld1_p16_x4);

-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x3() {
-        wide_store_load_roundtrip!(p16, 24, poly16x8x3_t, vst1q_p16_x3, vld1q_p16_x3);
-    }
-
-    #[simd_test(enable = "neon")]
-    unsafe fn test_vld1q_p16_x4() {
-        wide_store_load_roundtrip!(p16, 32, poly16x8x4_t, vst1q_p16_x4, vld1q_p16_x4);
+        test_vld1q_p16_x2(p16, 16, poly16x8x2_t, vst1q_p16_x2, vld1q_p16_x2);
+        test_vld1q_p16_x3(p16, 24, poly16x8x3_t, vst1q_p16_x3, vld1q_p16_x3);
+        test_vld1q_p16_x4(p16, 32, poly16x8x4_t, vst1q_p16_x4, vld1q_p16_x4);
     }
 }
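
For reference: the base `wide_store_load_roundtrip!` macro that every test above expands to is
defined in an earlier hunk of the series and is not visible in this excerpt. Judging only from
its call sites, `wide_store_load_roundtrip!(element, len, tuple_type, vst1_fn, vld1_fn)`, a
minimal sketch of such a macro, assuming it simply inlines the store/load/compare bodies that
these patches fold away, could look like this (the definition actually used by the series may
differ in detail):

    macro_rules! wide_store_load_roundtrip {
        ($elem:ty, $len:expr, $vec:ty, $store:ident, $load:ident) => {{
            // Build ascending test data, reinterpret it as the wide NEON tuple type,
            // store it through the vst1* intrinsic, load it back with the matching
            // vld1* intrinsic, and check that the round trip is lossless.
            let vals: [$elem; $len] = crate::array::from_fn(|i| i as $elem);
            let a: $vec = transmute(vals);
            let mut tmp = [0 as $elem; $len];
            $store(tmp.as_mut_ptr().cast(), a);
            let r: $vec = $load(tmp.as_ptr().cast());
            let out: [$elem; $len] = transmute(r);
            assert_eq!(out, vals);
        }};
    }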