Skip to content

Commit 5082756

Browse files
committed
Add 2*GAMMA2/128 (and the corresponding Barrett constants) as exceptions for check-magic
Signed-off-by: jammychiou1 <[email protected]>
1 parent dfacb88 commit 5082756

File tree

5 files changed

+55
-17
lines changed

5 files changed

+55
-17
lines changed

dev/x86_64/src/poly_decompose_32_avx2.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,12 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
3838
unsigned int i;
3939
__m256i f, f0, f1, t;
4040
const __m256i q_bound = _mm256_set1_epi32(31 * MLDSA_GAMMA2);
41-
/* check-magic: 1025 == floor(2**22 / 4092) */
41+
/*
42+
* Barrett constant 1025 = floor(2**22 / 4092), for computing the division
43+
* a -> round-(a / 4092). While it doesn't make a difference here, using
44+
* floor() instead of round() is how we make sure 1025 / 2^22 ≲ 1 / 4092
45+
* instead of ≳. See below for more details.
46+
*/
4247
const __m256i v = _mm256_set1_epi32(1025);
4348
const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
4449
const __m256i off = _mm256_set1_epi32(127);
@@ -48,7 +53,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
4853
{
4954
f = _mm256_load_si256(&a[i]);
5055

51-
/* check-magic: 4092 == 2 * ((MLDSA_Q-1) // 32) // 128 */
5256
/*
5357
* The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
5458
* alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -68,7 +72,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
6872
* _mm256_mulhi_epu16() below.
6973
*/
7074

71-
/* check-magic: off */
7275
/*
7376
* Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
7477
* for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
@@ -77,7 +80,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
7780
* The odd-index 16-bit lanes are still all 0 after this. As such, despite
7881
* that the following steps use 32-bit lanes, the value of f1 is unaffected.
7982
*/
80-
/* check-magic: on */
8183
f1 = _mm256_mulhi_epu16(f1, v);
8284
f1 = _mm256_mulhrs_epi16(f1, shift);
8385
/* range: 0 <= f1 <= 16 */

dev/x86_64/src/poly_decompose_88_avx2.c

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
3939
unsigned int i;
4040
__m256i f, f0, f1, t;
4141
const __m256i q_bound = _mm256_set1_epi32(87 * MLDSA_GAMMA2);
42-
/* check-magic: 11275 == floor(2**24 / 1488) */
42+
/*
43+
* Barrett constant 11275 = floor(2**24 / 1488), for computing the division
44+
* a -> round-(a / 1488). While it doesn't make a difference here, using
45+
* floor() instead of round() is how we make sure 11275 / 2^24 ≲ 1 / 1488
46+
* instead of ≳. See below for more details.
47+
*/
4348
const __m256i v = _mm256_set1_epi32(11275);
4449
const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
4550
const __m256i off = _mm256_set1_epi32(127);
@@ -49,7 +54,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
4954
{
5055
f = _mm256_load_si256(&a[i]);
5156

52-
/* check-magic: 1488 == 2 * ((MLDSA_Q-1) // 88) // 128 */
5357
/*
5458
* The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
5559
* alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -69,7 +73,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
6973
* _mm256_mulhi_epu16() below.
7074
*/
7175

72-
/* check-magic: off */
7376
/*
7477
* Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
7578
* for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
@@ -78,7 +81,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
7881
* The odd-index 16-bit lanes are still all 0 after this. As such, despite
7982
* that the following steps use 32-bit lanes, the value of f1 is unaffected.
8083
*/
81-
/* check-magic: on */
8284
f1 = _mm256_mulhi_epu16(f1, v);
8385
f1 = _mm256_mulhrs_epi16(f1, shift);
8486
/* range: 0 <= f1 <= 44 */

mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,12 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
3838
unsigned int i;
3939
__m256i f, f0, f1, t;
4040
const __m256i q_bound = _mm256_set1_epi32(31 * MLDSA_GAMMA2);
41-
/* check-magic: 1025 == floor(2**22 / 4092) */
41+
/*
42+
* Barrett constant 1025 = floor(2**22 / 4092), for computing the division
43+
* a -> round-(a / 4092). While it doesn't make a difference here, using
44+
* floor() instead of round() is how we make sure 1025 / 2^22 ≲ 1 / 4092
45+
* instead of ≳. See below for more details.
46+
*/
4247
const __m256i v = _mm256_set1_epi32(1025);
4348
const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
4449
const __m256i off = _mm256_set1_epi32(127);
@@ -48,7 +53,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
4853
{
4954
f = _mm256_load_si256(&a[i]);
5055

51-
/* check-magic: 4092 == 2 * ((MLDSA_Q-1) // 32) // 128 */
5256
/*
5357
* The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
5458
* alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -68,7 +72,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
6872
* _mm256_mulhi_epu16() below.
6973
*/
7074

71-
/* check-magic: off */
7275
/*
7376
* Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
7477
* for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
@@ -77,7 +80,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
7780
* The odd-index 16-bit lanes are still all 0 after this. As such, despite
7881
* that the following steps use 32-bit lanes, the value of f1 is unaffected.
7982
*/
80-
/* check-magic: on */
8183
f1 = _mm256_mulhi_epu16(f1, v);
8284
f1 = _mm256_mulhrs_epi16(f1, shift);
8385
/* range: 0 <= f1 <= 16 */
@@ -86,6 +88,18 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
8688
* If f1 = 16, i.e. f > 31*GAMMA2, proceed as if f' = f - Q was given
8789
* instead. (For f = 31*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
8890
* like other "wrapped around" cases.)
91+
*
92+
* Reference: They handle wrap-around in a somewhat convoluted way. Most
93+
* notably, they compute remainder f0 with quotient f1 that's
94+
* already wrapped around, so is off by q (instead of by 1) from
95+
* what it should be ultimately. They detect the need for
96+
* correction by checking if f0 is abnormally large.
97+
*
98+
* Our approach is closer to Algorithm 36 in the specification,
99+
* in that we compute f0 normally and correct f1, f0 in the way
100+
* they prescribed. The only real difference is that we check for
101+
* wrap-around by examining f directly, instead of some other
102+
* intermidiates computed from it.
89103
*/
90104

91105
/* Check for wrap-around */

mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,12 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
3939
unsigned int i;
4040
__m256i f, f0, f1, t;
4141
const __m256i q_bound = _mm256_set1_epi32(87 * MLDSA_GAMMA2);
42-
/* check-magic: 11275 == floor(2**24 / 1488) */
42+
/*
43+
* Barrett constant 11275 = floor(2**24 / 1488), for computing the division
44+
* a -> round-(a / 1488). While it doesn't make a difference here, using
45+
* floor() instead of round() is how we make sure 11275 / 2^24 ≲ 1 / 1488
46+
* instead of ≳. See below for more details.
47+
*/
4348
const __m256i v = _mm256_set1_epi32(11275);
4449
const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
4550
const __m256i off = _mm256_set1_epi32(127);
@@ -49,7 +54,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
4954
{
5055
f = _mm256_load_si256(&a[i]);
5156

52-
/* check-magic: 1488 == 2 * ((MLDSA_Q-1) // 88) // 128 */
5357
/*
5458
* The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
5559
* alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -69,7 +73,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
6973
* _mm256_mulhi_epu16() below.
7074
*/
7175

72-
/* check-magic: off */
7376
/*
7477
* Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
7578
* for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
@@ -78,7 +81,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
7881
* The odd-index 16-bit lanes are still all 0 after this. As such, despite
7982
* that the following steps use 32-bit lanes, the value of f1 is unaffected.
8083
*/
81-
/* check-magic: on */
8284
f1 = _mm256_mulhi_epu16(f1, v);
8385
f1 = _mm256_mulhrs_epi16(f1, shift);
8486
/* range: 0 <= f1 <= 44 */
@@ -87,6 +89,18 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
8789
* If f1 = 44, i.e. f > 87*GAMMA2, proceed as if f' = f - Q was given
8890
* instead. (For f = 87*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
8991
* like other "wrapped around" cases.)
92+
*
93+
* Reference: They handle wrap-around in a somewhat convoluted way. Most
94+
* notably, they compute remainder f0 with quotient f1 that's
95+
* already wrapped around, so is off by q (instead of by 1) from
96+
* what it should be ultimately. They detect the need for
97+
* correction by checking if f0 is abnormally large.
98+
*
99+
* Our approach is closer to Algorithm 36 in the specification,
100+
* in that we compute f0 normally and correct f1, f0 in the way
101+
* they prescribed. The only real difference is that we check for
102+
* wrap-around by examining f directly, instead of some other
103+
* intermidiates computed from it.
90104
*/
91105

92106
/* Check for wrap-around */

scripts/check-magic

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,13 @@ def get_files(pattern):
2323

2424
def check_magic_numbers():
2525
mldsa_q = 8380417
26-
exceptions = [mldsa_q]
26+
exceptions = [
27+
mldsa_q,
28+
4092, # 2*GAMMA2/128 for GAMMA2 = (q-1)/32
29+
1025, # Barrett constant for computing a -> round-(a / 4092)
30+
1488, # 2*GAMMA2/128 for GAMMA2 = (q-1)/88
31+
11275, # Barrett constant for computing a -> round-(a / 1488)
32+
]
2733
enable_marker = "check-magic: on"
2834
disable_marker = "check-magic: off"
2935
autogen_marker = "This file is auto-generated from scripts/autogen"

0 commit comments

Comments
 (0)