Add 2*GAMMA2/128 (and the corresponding Barrett constants) as exceptions for check-magic

jammychiou1 · jammychiou1 · commit 5082756d1484 · 2025-11-08T16:10:31.000+08:00
Signed-off-by: jammychiou1 <jammy.chiou1@gmail.com>
diff --git a/dev/x86_64/src/poly_decompose_32_avx2.c b/dev/x86_64/src/poly_decompose_32_avx2.c
@@ -38,7 +38,12 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   unsigned int i;
   __m256i f, f0, f1, t;
   const __m256i q_bound = _mm256_set1_epi32(31 * MLDSA_GAMMA2);
-  /* check-magic: 1025 == floor(2**22 / 4092) */
+  /*
+   * Barrett constant 1025 = floor(2**22 / 4092), for computing the division
+   * a -> round-(a / 4092). While it doesn't make a difference here, using
+   * floor() instead of round() is how we make sure 1025 / 2^22 ≲ 1 / 4092
+   * instead of ≳. See below for more details.
+   */
   const __m256i v = _mm256_set1_epi32(1025);
   const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
   const __m256i off = _mm256_set1_epi32(127);
@@ -48,7 +53,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   {
     f = _mm256_load_si256(&a[i]);
 
-    /* check-magic: 4092 == 2 * ((MLDSA_Q-1) // 32) // 128 */
     /*
      * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
      * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -68,7 +72,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
-    /* check-magic: off */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
@@ -77,7 +80,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * The odd-index 16-bit lanes are still all 0 after this. As such, despite
      * that the following steps use 32-bit lanes, the value of f1 is unaffected.
      */
-    /* check-magic: on */
     f1 = _mm256_mulhi_epu16(f1, v);
     f1 = _mm256_mulhrs_epi16(f1, shift);
     /* range: 0 <= f1 <= 16 */
diff --git a/dev/x86_64/src/poly_decompose_88_avx2.c b/dev/x86_64/src/poly_decompose_88_avx2.c
@@ -39,7 +39,12 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   unsigned int i;
   __m256i f, f0, f1, t;
   const __m256i q_bound = _mm256_set1_epi32(87 * MLDSA_GAMMA2);
-  /* check-magic: 11275 == floor(2**24 / 1488) */
+  /*
+   * Barrett constant 11275 = floor(2**24 / 1488), for computing the division
+   * a -> round-(a / 1488). While it doesn't make a difference here, using
+   * floor() instead of round() is how we make sure 11275 / 2^24 ≲ 1 / 1488
+   * instead of ≳. See below for more details.
+   */
   const __m256i v = _mm256_set1_epi32(11275);
   const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
   const __m256i off = _mm256_set1_epi32(127);
@@ -49,7 +54,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   {
     f = _mm256_load_si256(&a[i]);
 
-    /* check-magic: 1488 == 2 * ((MLDSA_Q-1) // 88) // 128 */
     /*
      * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
      * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -69,7 +73,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
-    /* check-magic: off */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
@@ -78,7 +81,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * The odd-index 16-bit lanes are still all 0 after this. As such, despite
      * that the following steps use 32-bit lanes, the value of f1 is unaffected.
      */
-    /* check-magic: on */
     f1 = _mm256_mulhi_epu16(f1, v);
     f1 = _mm256_mulhrs_epi16(f1, shift);
     /* range: 0 <= f1 <= 44 */
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_32_avx2.c
@@ -38,7 +38,12 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   unsigned int i;
   __m256i f, f0, f1, t;
   const __m256i q_bound = _mm256_set1_epi32(31 * MLDSA_GAMMA2);
-  /* check-magic: 1025 == floor(2**22 / 4092) */
+  /*
+   * Barrett constant 1025 = floor(2**22 / 4092), for computing the division
+   * a -> round-(a / 4092). While it doesn't make a difference here, using
+   * floor() instead of round() is how we make sure 1025 / 2^22 ≲ 1 / 4092
+   * instead of ≳. See below for more details.
+   */
   const __m256i v = _mm256_set1_epi32(1025);
   const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
   const __m256i off = _mm256_set1_epi32(127);
@@ -48,7 +53,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   {
     f = _mm256_load_si256(&a[i]);
 
-    /* check-magic: 4092 == 2 * ((MLDSA_Q-1) // 32) // 128 */
     /*
      * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
      * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -68,7 +72,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
-    /* check-magic: off */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 1025 / 2^22). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 1025 / 2^22 ≲
@@ -77,7 +80,6 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * The odd-index 16-bit lanes are still all 0 after this. As such, despite
      * that the following steps use 32-bit lanes, the value of f1 is unaffected.
      */
-    /* check-magic: on */
     f1 = _mm256_mulhi_epu16(f1, v);
     f1 = _mm256_mulhrs_epi16(f1, shift);
     /* range: 0 <= f1 <= 16 */
@@ -86,6 +88,18 @@ void mld_poly_decompose_32_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * If f1 = 16, i.e. f > 31*GAMMA2, proceed as if f' = f - Q was given
      * instead. (For f = 31*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
      * like other "wrapped around" cases.)
+     *
+     * Reference: They handle wrap-around in a somewhat convoluted way. Most
+     *            notably, they compute remainder f0 with quotient f1 that's
+     *            already wrapped around, so is off by q (instead of by 1) from
+     *            what it should be ultimately. They detect the need for
+     *            correction by checking if f0 is abnormally large.
+     *
+     *            Our approach is closer to Algorithm 36 in the specification,
+     *            in that we compute f0 normally and correct f1, f0 in the way
+     *            they prescribed. The only real difference is that we check for
+     *            wrap-around by examining f directly, instead of some other
+     *            intermediates computed from it.
      */
 
     /* Check for wrap-around */
diff --git a/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c b/mldsa/src/native/x86_64/src/poly_decompose_88_avx2.c
@@ -39,7 +39,12 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   unsigned int i;
   __m256i f, f0, f1, t;
   const __m256i q_bound = _mm256_set1_epi32(87 * MLDSA_GAMMA2);
-  /* check-magic: 11275 == floor(2**24 / 1488) */
+  /*
+   * Barrett constant 11275 = floor(2**24 / 1488), for computing the division
+   * a -> round-(a / 1488). While it doesn't make a difference here, using
+   * floor() instead of round() is how we make sure 11275 / 2^24 ≲ 1 / 1488
+   * instead of ≳. See below for more details.
+   */
   const __m256i v = _mm256_set1_epi32(11275);
   const __m256i alpha = _mm256_set1_epi32(2 * MLDSA_GAMMA2);
   const __m256i off = _mm256_set1_epi32(127);
@@ -49,7 +54,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
   {
     f = _mm256_load_si256(&a[i]);
 
-    /* check-magic: 1488 == 2 * ((MLDSA_Q-1) // 88) // 128 */
     /*
      * The goal is to compute f1 = round-(f / (2*GAMMA2)), which can be computed
      * alternatively as round-(f / (128B)) = round-(ceil(f / 128) / B) where
@@ -69,7 +73,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * _mm256_mulhi_epu16() below.
      */
 
-    /* check-magic: off */
     /*
      * Compute f1 = round-(f1' / B) ≈ round(f1' * 11275 / 2^24). This is exact
      * for 0 <= f1' < 2^16. Note that half is rounded down since 11275 / 2^24 ≲
@@ -78,7 +81,6 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * The odd-index 16-bit lanes are still all 0 after this. As such, despite
      * that the following steps use 32-bit lanes, the value of f1 is unaffected.
      */
-    /* check-magic: on */
     f1 = _mm256_mulhi_epu16(f1, v);
     f1 = _mm256_mulhrs_epi16(f1, shift);
     /* range: 0 <= f1 <= 44 */
@@ -87,6 +89,18 @@ void mld_poly_decompose_88_avx2(__m256i *a1, __m256i *a0, const __m256i *a)
      * If f1 = 44, i.e. f > 87*GAMMA2, proceed as if f' = f - Q was given
      * instead. (For f = 87*GAMMA2 + 1 thus f' = -GAMMA2, we still round it to 0
      * like other "wrapped around" cases.)
+     *
+     * Reference: They handle wrap-around in a somewhat convoluted way. Most
+     *            notably, they compute remainder f0 with quotient f1 that's
+     *            already wrapped around, so is off by q (instead of by 1) from
+     *            what it should be ultimately. They detect the need for
+     *            correction by checking if f0 is abnormally large.
+     *
+     *            Our approach is closer to Algorithm 36 in the specification,
+     *            in that we compute f0 normally and correct f1, f0 in the way
+     *            they prescribed. The only real difference is that we check for
+     *            wrap-around by examining f directly, instead of some other
+     *            intermediates computed from it.
      */
 
     /* Check for wrap-around */
diff --git a/scripts/check-magic b/scripts/check-magic
@@ -23,7 +23,13 @@ def get_files(pattern):
 
 def check_magic_numbers():
     mldsa_q = 8380417
-    exceptions = [mldsa_q]
+    exceptions = [
+            mldsa_q,
+            4092,  # 2*GAMMA2/128 for GAMMA2 = (q-1)/32
+            1025,  # Barrett constant for computing a -> round-(a / 4092)
+            1488,  # 2*GAMMA2/128 for GAMMA2 = (q-1)/88
+            11275, # Barrett constant for computing a -> round-(a / 1488)
+    ]
     enable_marker = "check-magic: on"
     disable_marker = "check-magic: off"
     autogen_marker = "This file is auto-generated from scripts/autogen"