From 263460aed560086117787484f80ffefd23290e8f Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 09:39:22 +0000 Subject: [PATCH 1/6] fdct: don't mark AVX helper functions as unsafe fn --- src/avx2/fdct.rs | 40 ++++++++++++++++++++++++---------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index b7caa38..615963b 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -65,9 +65,10 @@ pub fn fdct_avx2(data: &mut [i16; 64]) { #[target_feature(enable = "avx2")] unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PW_F130_F054_MF130_F054() -> __m256i { + #[inline] + fn PW_F130_F054_MF130_F054() -> __m256i { _mm256_set_epi16( F_0_541, F_0_541 - F_1_847, @@ -88,9 +89,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { ) } + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PW_MF078_F117_F078_F117() -> __m256i { + #[inline] + fn PW_MF078_F117_F078_F117() -> __m256i { _mm256_set_epi16( F_1_175, F_1_175 - F_0_390, @@ -111,9 +113,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { ) } + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PW_MF060_MF089_MF050_MF256() -> __m256i { + #[inline] + fn PW_MF060_MF089_MF050_MF256() -> __m256i { _mm256_set_epi16( -F_2_562, F_2_053 - F_2_562, @@ -134,9 +137,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { ) } + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PW_F050_MF256_F060_MF089() -> __m256i { + #[inline] + fn PW_F050_MF256_F060_MF089() -> __m256i { _mm256_set_epi16( -F_0_899, F_1_501 - F_0_899, @@ -157,9 +161,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { ) } + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PD_DESCALE_P(first_pass: bool) -> __m256i { + #[inline] + fn PD_DESCALE_P(first_pass: bool) -> __m256i { if first_pass { _mm256_set_epi32( 1 << (DESCALE_P1 - 1), @@ -185,9 +190,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { } } + #[target_feature(enable = "avx2")] #[allow(non_snake_case)] - #[inline(always)] - unsafe fn PW_DESCALE_P2X() -> __m256i { + #[inline] + fn PW_DESCALE_P2X() -> __m256i { _mm256_set_epi32( 1 << (PASS1_BITS - 1), 1 << (PASS1_BITS - 1), @@ -201,8 +207,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { } // In-place 8x8x16-bit matrix transpose using AVX2 instructions - #[inline(always)] - unsafe fn do_transpose( + #[target_feature(enable = "avx2")] + #[inline] + fn do_transpose( i1: __m256i, i2: __m256i, i3: __m256i, @@ -244,8 +251,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { } // In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions - #[inline(always)] - unsafe fn do_dct( + #[target_feature(enable = "avx2")] + #[inline] + fn do_dct( first_pass: bool, i1: __m256i, i2: __m256i, From b1737f8372e5bc10bb95f5dd7376e2dd4894dfd5 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 09:55:21 +0000 Subject: [PATCH 2/6] Add safe wrappers for AVX loads/stores --- src/avx2/fdct.rs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index 615963b..433ae15 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -466,3 +466,21 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { _mm256_storeu_si256(out_data.add(2), ymm6); _mm256_storeu_si256(out_data.add(3), ymm7); } + +/// Safe wrapper for an unaligned AVX load +#[target_feature(enable = "avx2")] +#[inline] +fn avx_load(input: &[i16; 16]) -> __m256i { + assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); + // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. + unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) } +} + +/// Safe wrapper for an unaligned AVX store +#[target_feature(enable = "avx2")] +#[inline] +fn avx_store(input: __m256i, output: &mut [i16; 16]) { + assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); + // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. + unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) } +} \ No newline at end of file From 3e1cd54fe243464c4d881344afbbbfcbac011dcf Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 10:02:35 +0000 Subject: [PATCH 3/6] Use the safe loads/store wrappers inside the AVX FDCT, drop the 'unsafe fn' from it now that it contains no unsafe ops --- src/avx2/fdct.rs | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index 433ae15..a2e89b9 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -64,7 +64,7 @@ pub fn fdct_avx2(data: &mut [i16; 64]) { } #[target_feature(enable = "avx2")] -unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { +fn fdct_avx2_internal(data: &mut [i16; 64]) { #[target_feature(enable = "avx2")] #[allow(non_snake_case)] #[inline] @@ -420,12 +420,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { (t1, t2, t3, t4) } - let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); - - let ymm4 = _mm256_loadu_si256(in_data); - let ymm5 = _mm256_loadu_si256(in_data.add(1)); - let ymm6 = _mm256_loadu_si256(in_data.add(2)); - let ymm7 = _mm256_loadu_si256(in_data.add(3)); + let ymm4 = avx_load(data[0..16].try_into().unwrap()); + let ymm5 = avx_load(data[16..32].try_into().unwrap()); + let ymm6 = avx_load(data[32..48].try_into().unwrap()); + let ymm7 = avx_load(data[48..64].try_into().unwrap()); // ---- Pass 1: process rows. // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) @@ -459,12 +457,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) { let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 - let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); - - _mm256_storeu_si256(out_data, ymm3); - _mm256_storeu_si256(out_data.add(1), ymm5); - _mm256_storeu_si256(out_data.add(2), ymm6); - _mm256_storeu_si256(out_data.add(3), ymm7); + avx_store(ymm3, &mut data[0..16].try_into().unwrap()); + avx_store(ymm5, &mut data[16..32].try_into().unwrap()); + avx_store(ymm6, &mut data[32..48].try_into().unwrap()); + avx_store(ymm7, &mut data[48..64].try_into().unwrap()); } /// Safe wrapper for an unaligned AVX load From 18ec1e603b2414b14f0ac40d7d7dd0dcbe1a0c83 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 10:26:19 +0000 Subject: [PATCH 4/6] Roll back to last working configuration; something must be wrong with the stores --- src/avx2/fdct.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index a2e89b9..8222a58 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -457,10 +457,13 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) { let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 - avx_store(ymm3, &mut data[0..16].try_into().unwrap()); - avx_store(ymm5, &mut data[16..32].try_into().unwrap()); - avx_store(ymm6, &mut data[32..48].try_into().unwrap()); - avx_store(ymm7, &mut data[48..64].try_into().unwrap()); + unsafe { + let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); + _mm256_storeu_si256(out_data, ymm3); + _mm256_storeu_si256(out_data.add(1), ymm5); + _mm256_storeu_si256(out_data.add(2), ymm6); + _mm256_storeu_si256(out_data.add(3), ymm7); + } } /// Safe wrapper for an unaligned AVX load From 68c9485661a73470ee0abf2a597cb63de8bd2257 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 10:41:25 +0000 Subject: [PATCH 5/6] Fix subtle bug due to intermediate arrays being created by the try_into() call --- Cargo.toml | 2 +- src/avx2/fdct.rs | 25 ++++++++++++------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 28abe8f..7ecae61 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ repository = "https://github.com/vstroebel/jpeg-encoder" rust-version = "1.61" [features] -default = ["std"] +default = ["std", "simd"] simd = ["std"] std = [] diff --git a/src/avx2/fdct.rs b/src/avx2/fdct.rs index 8222a58..ec12b72 100644 --- a/src/avx2/fdct.rs +++ b/src/avx2/fdct.rs @@ -420,10 +420,10 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) { (t1, t2, t3, t4) } - let ymm4 = avx_load(data[0..16].try_into().unwrap()); - let ymm5 = avx_load(data[16..32].try_into().unwrap()); - let ymm6 = avx_load(data[32..48].try_into().unwrap()); - let ymm7 = avx_load(data[48..64].try_into().unwrap()); + let ymm4 = avx_load(&data[0..16]); + let ymm5 = avx_load(&data[16..32]); + let ymm6 = avx_load(&data[32..48]); + let ymm7 = avx_load(&data[48..64]); // ---- Pass 1: process rows. // ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17) @@ -457,19 +457,17 @@ fn fdct_avx2_internal(data: &mut [i16; 64]) { let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5 let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7 - unsafe { - let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr()); - _mm256_storeu_si256(out_data, ymm3); - _mm256_storeu_si256(out_data.add(1), ymm5); - _mm256_storeu_si256(out_data.add(2), ymm6); - _mm256_storeu_si256(out_data.add(3), ymm7); - } + avx_store(ymm3, &mut data[0..16]); + avx_store(ymm5, &mut data[16..32]); + avx_store(ymm6, &mut data[32..48]); + avx_store(ymm7, &mut data[48..64]); } /// Safe wrapper for an unaligned AVX load #[target_feature(enable = "avx2")] #[inline] -fn avx_load(input: &[i16; 16]) -> __m256i { +fn avx_load(input: &[i16]) -> __m256i { + assert!(input.len() == 16); assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) } @@ -478,7 +476,8 @@ fn avx_load(input: &[i16; 16]) -> __m256i { /// Safe wrapper for an unaligned AVX store #[target_feature(enable = "avx2")] #[inline] -fn avx_store(input: __m256i, output: &mut [i16; 16]) { +fn avx_store(input: __m256i, output: &mut [i16]) { + assert!(output.len() == 16); assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>()); // SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements. unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) } From 12f8f9dd746fc41cf212c4208179800177e7db23 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Thu, 30 Oct 2025 10:49:13 +0000 Subject: [PATCH 6/6] Switch simd feature back to opt-in, it can be flipped in a subsequent PR --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 7ecae61..28abe8f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,7 @@ repository = "https://github.com/vstroebel/jpeg-encoder" rust-version = "1.61" [features] -default = ["std", "simd"] +default = ["std"] simd = ["std"] std = []