WIP: Schoenhage-Strassen debugging

tautschnig · tautschnig · commit 1c60d3ab9168 · 2025-10-13T22:06:42.000+02:00
diff --git a/src/solvers/flattening/bv_utils.cpp b/src/solvers/flattening/bv_utils.cpp
@@ -1000,11 +1000,12 @@ bvt bv_utilst::comba_column_wise(const std::vector<bvt> &pps)
 // #define RADIX_MULTIPLIER 8
 // #define USE_KARATSUBA
 // #define USE_TOOM_COOK
-// #define USE_SCHOENHAGE_STRASSEN
+#define USE_SCHOENHAGE_STRASSEN
 #ifdef RADIX_MULTIPLIER
+//#  define COMBA
 #  define DADDA_TREE
 #endif
-#define COMBA
+// #define COMBA
 
 #ifdef RADIX_MULTIPLIER
 static bvt unsigned_multiply_by_3(propt &prop, const bvt &op)
@@ -2183,12 +2184,29 @@ bvt bv_utilst::unsigned_toom_cook_multiplier(const bvt &_op0, const bvt &_op1)
 }
 
 bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
-  const bvt &a,
-  const bvt &b)
+  const bvt &_a,
+  const bvt &_b)
 {
+  // http://malte-leip.net/beschreibung_ssa.pdf,
+  // https://de.wikipedia.org/wiki/Sch%C3%B6nhage-Strassen-Algorithmus
+  // Isabelle proof: https://mediatum.ub.tum.de/doc/1717658/1717658.pdf
+  bvt a = _a;
+#if 1
+  // bvt b = a;
+  bvt b = _b;
+#else
+  bvt b = _b;
+  a.resize(14);
+  b.resize(14);
+#endif
+
   PRECONDITION(a.size() == b.size());
+  const std::size_t op_size = a.size();
+  // delta.size() <= result_size doesn't hold for op_size <= 3
+  if(op_size <= 3)
+    return unsigned_multiplier(a, b);
 
-  // Running examples: we want to multiple 213 by 15 as 8- or 9-bit integers.
+  // Running example: we want to multiply 213 by 15 as 8- or 9-bit integers.
   // That is, we seek to multiply 11010101 (011010101) by 00001111 (000001111).
   //                              ^bit 7 ^bit 0
   // The expected result is 123 as both an 8-bit and 9-bit result (001111011).
@@ -2202,7 +2220,7 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
   // m >= log_2(op_size) + 1.
   // For our examples m will be 4 and 5, respectively, with Fermat numbers
   // 2^16 + 1 and 2^32 + 1.
-  const std::size_t m = address_bits(a.size()) + 1;
+  const std::size_t m = address_bits(op_size) + 1;
   std::cerr << "m: " << m << std::endl;
 
   // Extend bit width to 2^(m + 1) = op_size (rounded to next power of 2) * 4
@@ -2243,8 +2261,11 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
   {
     a_rho.emplace_back(
       a_ext.begin() + i * chunk_size, a_ext.begin() + (i + 1) * chunk_size);
+    std::cerr << "a_rho[" << i << "]: " << beautify(a_rho.back()) << std::endl;
     b_sigma.emplace_back(
       b_ext.begin() + i * chunk_size, b_ext.begin() + (i + 1) * chunk_size);
+    std::cerr << "b_sigma[" << i << "]: " << beautify(b_sigma.back())
+              << std::endl;
   }
   // For our example we now have
   // a_rho = [ 0101, 1101, 0000, ..., 0000 ]
@@ -2266,11 +2287,16 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
         ++rho)
     {
       const std::size_t sigma = tau - rho;
-      gamma_tau[tau] = add(
-        gamma_tau[tau],
+      std::cerr << "Inner multiplication a_" << rho << " * b_" << sigma;
+      auto inner_product = zero_extension(
         unsigned_multiplier(
-          zero_extension(a_rho[rho], 3 * n + 5),
-          zero_extension(b_sigma[sigma], 3 * n + 5)));
+          zero_extension(a_rho[rho], chunk_size * 2),
+          zero_extension(b_sigma[sigma], chunk_size * 2)),
+        3 * n + 5);
+      std::cerr << " = " << beautify(inner_product) << std::endl;
+      gamma_tau[tau] = add(gamma_tau[tau], inner_product);
+      std::cerr << "gamma_tau[" << tau << "] = " << beautify(gamma_tau[tau])
+                << std::endl;
     }
   }
   // For our example we obtain
@@ -2282,6 +2308,7 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
   c_tau.reserve(num_chunks);
   for(std::size_t tau = 0; tau < num_chunks; ++tau)
   {
+    // std::cerr << "gamma_tau[" << tau << "]: " << beautify(gamma_tau[tau]) << std::endl;
     c_tau.push_back(add(gamma_tau[tau], gamma_tau[tau + num_chunks]));
     CHECK_RETURN(c_tau.back().size() >= address_bits(num_chunks) + 1);
     c_tau.back().resize(address_bits(num_chunks) + 1);
@@ -2295,7 +2322,11 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
   std::vector<bvt> z_j;
   z_j.reserve(num_chunks / 2);
   for(std::size_t j = 0; j < num_chunks / 2; ++j)
+  {
     z_j.push_back(sub(c_tau[j], c_tau[j + num_chunks / 2]));
+    // z_j.back().resize(address_bits(num_chunks) + 1);
+    std::cerr << "z_" << j << " = " << beautify(z_j.back()) << std::endl;
+  }
   // For our example we have z_j = c_tau as all elements beyond the second one
   // are zeros.
 
@@ -2325,14 +2356,14 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
   // inverse NTT.
 
   // Addition mod F_n with overflow
-  auto cyclic_add = [this](const bvt &x, const bvt &y)
-  {
+  auto cyclic_add = [this](const bvt &x, const bvt &y) {
     PRECONDITION(x.size() == y.size());
 
     auto result_with_overflow = adder(x, y, const_literal(false));
     if(result_with_overflow.second.is_false())
       return result_with_overflow.first;
 
+    std::cerr << "OVERFLOW" << std::endl;
     return add(
       result_with_overflow.first,
       zero_extension(bvt{1, result_with_overflow.second}, x.size()));
@@ -2365,14 +2396,16 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
       j <<= 1; // the initial shift has no effect
       j |= (k & (1 << nu)) >> nu;
     }
+    std::cerr << "k=" << k << " yields j=" << j << std::endl;
     Aa.push_back(a_j[j]);
     Ab.push_back(b_j[j]);
+    std::cerr << "Aa[0](" << k << "): " << beautify(Aa.back()) << std::endl;
   }
-  for(std::size_t nu = 1; nu <= address_bits(num_chunks); ++nu)
+  for(std::size_t nu = 0; nu < address_bits(num_chunks); ++nu)
   {
-    const std::size_t bit_nu = (std::size_t)1 << (nu - 1);
+    const std::size_t bit_nu = (std::size_t)1 << nu;
     std::size_t bits_up_to_nu = 0;
-    for(std::size_t i = 0; i < nu - 1; ++i)
+    for(std::size_t i = 0; i < nu; ++i)
       bits_up_to_nu |= 1 << i;
 
     // we only need odd ones
@@ -2384,21 +2417,26 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
       bvt Aa_nu_bit_is_zero = Aa[k & ~bit_nu];
       bvt Ab_nu_bit_is_zero = Ab[k & ~bit_nu];
 
+      std::cerr << "Round " << (nu + 1) << ", k=" << k
+                << ", k & ~bit_nu=" << (k & ~bit_nu) << std::endl;
       const std::size_t chi = (k & bits_up_to_nu)
-                              << (address_bits(num_chunks) - 1 - (nu - 1));
+                              << (address_bits(num_chunks) - 1 - nu);
+      std::cerr << "k & bits_up_to_nu=" << (k & bits_up_to_nu)
+                << ", chi=" << chi << std::endl;
       const std::size_t omega = m % 2 == 1 ? 2 : 4;
       const std::size_t shift_dist = chi * omega / 2;
 
-      if(nu > 1) // no need to update even indices
+      if(nu > 0) // no need to update even indices
       {
         Aa[k & ~bit_nu] = cyclic_add(
           Aa_nu_bit_is_zero, shift(Aa[k], shiftt::ROTATE_LEFT, shift_dist));
         Ab[k & ~bit_nu] = cyclic_add(
           Ab_nu_bit_is_zero, shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist));
-        std::cerr << "Aa[" << nu << "](" << (k & ~bit_nu)
+        std::cerr << "shift_dist: " << shift_dist << std::endl;
+        std::cerr << "Aa[" << (nu + 1) << "](" << (k & ~bit_nu)
                   << "): " << beautify(Aa[k & ~bit_nu]) << std::endl;
 #if 0
-        std::cerr << "Ab[" << nu << "](" << (k & ~bit_nu)
+        std::cerr << "Ab[" << (nu + 1) << "](" << (k & ~bit_nu)
                   << "): " << beautify(Ab[k & ~bit_nu]) << std::endl;
 #endif
       }
@@ -2412,19 +2450,20 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
       Ab[k] = cyclic_add(
         Ab_nu_bit_is_zero,
         shift(Ab[k], shiftt::ROTATE_LEFT, shift_dist_for_sub));
-      std::cerr << "Aa[" << nu << "](" << k << "): " << beautify(Aa[k])
+      std::cerr << "shift_dist_for_sub: " << shift_dist_for_sub << std::endl;
+      std::cerr << "Aa[" << (nu + 1) << "](" << k << "): " << beautify(Aa[k])
                 << std::endl;
 #if 0
-      std::cerr << "Ab[" << nu << "](" << k << "): " << beautify(Ab[k])
+      std::cerr << "Ab[" << (nu + 1) << "](" << k << "): " << beautify(Ab[k])
                 << std::endl;
 #endif
     }
   }
 
   // Either compute u - v (if u > v), else u - v + 2^2^n + 1
-  auto reduce_to_mod_F_n = [this](const bvt &x)
-  {
+  auto reduce_to_mod_F_n = [this](const bvt &x, std::size_t n) {
     const std::size_t two_to_power_of_n = x.size() / 2;
+    PRECONDITION(two_to_power_of_n == (std::size_t)1 << n);
     // std::cerr << "two_to_power_of_n: " << two_to_power_of_n << std::endl;
     const bvt u =
       zero_extension(bvt{x.begin(), x.begin() + two_to_power_of_n}, x.size());
@@ -2445,54 +2484,64 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
 
   std::vector<bvt> a_hat_k{num_chunks, bvt{}}, b_hat_k{num_chunks, bvt{}};
   // Reduce by F_n
-  for(std::size_t j = 1; j < num_chunks; j += 2)
+  for(std::size_t k = 1; k < num_chunks; k += 2)
   {
-    a_hat_k[j] = reduce_to_mod_F_n(Aa[j]);
-    std::cerr << "a_hat_k[" << j << "]: " << beautify(a_hat_k[j]) << std::endl;
-    b_hat_k[j] = reduce_to_mod_F_n(Ab[j]);
-    std::cerr << "b_hat_k[" << j << "]: " << beautify(b_hat_k[j]) << std::endl;
+    a_hat_k[k] = reduce_to_mod_F_n(Aa[k], n);
+    std::cerr << "a_hat_k[" << k << "]: " << beautify(a_hat_k[k]) << std::endl;
+    b_hat_k[k] = reduce_to_mod_F_n(Ab[k], n);
+    std::cerr << "b_hat_k[" << k << "]: " << beautify(b_hat_k[k]) << std::endl;
   }
 
   // Compute point-wise multiplication
   std::vector<bvt> c_hat_k{num_chunks, bvt{}};
-  for(std::size_t j = 1; j < num_chunks; j += 2)
-  {
-    c_hat_k[j] = unsigned_multiplier(a_hat_k[j], b_hat_k[j]);
-    std::cerr << "c_hat_k[" << j << "]: " << beautify(c_hat_k[j]) << std::endl;
+  for(std::size_t k = 1; k < num_chunks; k += 2)
+  {
+    // If at least one of a_hat_k[k] or b_hat_k[k] is 2^2^n (i.e., F_n - 1),
+    // multiplication would overflow, so handle those cases separately:
+    // x * 2^2^n = x * -1 (mod F_n), which can be computed by rotating x by 2^n
+    const std::size_t power_2_n = (std::size_t)1 << n;
+    c_hat_k[k] = select(
+      a_hat_k[k][power_2_n],
+      shift(b_hat_k[k], shiftt::ROTATE_LEFT, power_2_n),
+      select(
+        b_hat_k[k][power_2_n],
+        shift(a_hat_k[k], shiftt::ROTATE_LEFT, power_2_n),
+        unsigned_multiplier(a_hat_k[k], b_hat_k[k])));
+    std::cerr << "c_hat_k[" << k << "]: " << beautify(c_hat_k[k]) << std::endl;
   }
 
   // Apply inverse NTT
   for(std::size_t nu = address_bits(num_chunks) - 1; nu > 0; --nu)
   {
-    const std::size_t bit_nu_plus_1 = (std::size_t)1 << nu;
-    std::size_t bits_up_to_nu_plus_1 = 0;
+    const std::size_t bit_nu = (std::size_t)1 << nu;
+    std::size_t bits_up_to_nu = 0;
     for(std::size_t i = 0; i < nu; ++i)
-      bits_up_to_nu_plus_1 |= 1 << i;
+      bits_up_to_nu |= 1 << i;
 
     // we only need odd ones
     for(std::size_t k = 1; k < num_chunks; k += 2)
     {
-      if((k & bit_nu_plus_1) == 0)
+      if((k & bit_nu) == 0)
         continue;
 
-      bvt c_hat_k_nu_plus_1_bit_is_zero = c_hat_k[k & ~bit_nu_plus_1];
+      bvt c_hat_k_nu_bit_is_zero = c_hat_k[k & ~bit_nu];
 
-      c_hat_k[k & ~bit_nu_plus_1] = shift(
-        cyclic_add(c_hat_k_nu_plus_1_bit_is_zero, c_hat_k[k]),
+      c_hat_k[k & ~bit_nu] = shift(
+        cyclic_add(c_hat_k_nu_bit_is_zero, c_hat_k[k]),
         shiftt::ROTATE_RIGHT,
         1);
-      std::cerr << "c_hat_k[" << nu << "](" << (k & ~bit_nu_plus_1)
-                << "): " << beautify(c_hat_k[k & ~bit_nu_plus_1]) << std::endl;
+      std::cerr << "c_hat_k[" << nu << "](" << (k & ~bit_nu)
+                << "): " << beautify(c_hat_k[k & ~bit_nu]) << std::endl;
 
-      const std::size_t chi = (k & bits_up_to_nu_plus_1)
+      const std::size_t chi = (k & bits_up_to_nu)
                               << (address_bits(num_chunks) - 1 - nu);
       const std::size_t omega = m % 2 == 1 ? 2 : 4;
       const std::size_t shift_dist = chi * omega / 2 + 1;
       std::cerr << "SHIFT: " << shift_dist << std::endl;
 
       c_hat_k[k] = shift(
         cyclic_add(
-          c_hat_k_nu_plus_1_bit_is_zero,
+          c_hat_k_nu_bit_is_zero,
           shift(c_hat_k[k], shiftt::ROTATE_LEFT, (std::size_t)1 << n)),
         shiftt::ROTATE_RIGHT,
         shift_dist);
@@ -2514,45 +2563,63 @@ bvt bv_utilst::unsigned_schoenhage_strassen_multiplier(
     }
     k |= 1;
     std::cerr << "j " << j << " maps to " << k << std::endl;
-    z_j_mod_F_n.push_back(reduce_to_mod_F_n(c_hat_k[k]));
+    // TODO: we could add a capability to reduce_to_mod_F_n to restrict the
+    // result to op_size bits so that this call (but not the ones above) could
+    // use it.
+    z_j_mod_F_n.push_back(reduce_to_mod_F_n(c_hat_k[k], n));
     std::cerr << "z_j_mod_F_n[" << j << "]: " << beautify(z_j_mod_F_n[j])
               << std::endl;
   }
 
-  // Compute final coefficients as eta + delta * F_n where delta = eta - xi for
-  // eta z_j and xi c_hat_k.
+  // Compute final coefficients as xi + delta * F_n, where eta is z_j[j], xi is
+  // z_j_mod_F_n[j], and delta = eta - xi (mod 2^(n + 2) for odd m and
+  // 2^(n + 1) for even m).
+#if 0
+  // To compute the full-width result
+  const std::size_t result_size = two_to_m_plus_1;
+#else
+  const std::size_t result_size = op_size;
+#endif
   for(std::size_t j = 0; j < num_chunks / 2; ++j)
   {
-    bvt eta = z_j_mod_F_n[j];
+    const bvt &eta = z_j[j];
     std::cerr << "eta[" << j << "]: " << beautify(eta) << std::endl;
-    bvt xi = z_j[j];
+    const bvt &xi = z_j_mod_F_n[j];
     std::cerr << "xi[" << j << "]: " << beautify(xi) << std::endl;
-    // TODO: couldn't we do this over just xi.size() bits instead?
-    bvt delta = sub(eta, zero_extension(xi, eta.size()));
-    CHECK_RETURN(delta.size() >= xi.size());
-    delta.resize(xi.size());
+    PRECONDITION(eta.size() == address_bits(num_chunks) + 1);
+    bvt xi_2n_2{xi.begin(), xi.begin() + eta.size()};
+    bvt delta = sub(eta, xi_2n_2);
+    PRECONDITION(delta.size() <= result_size);
     std::cerr << "delta[" << j << "]: " << beautify(delta) << std::endl;
-    z_j[j] = add(
-      zero_extension(eta, two_to_m_plus_1),
-      add(
-        shift(
-          zero_extension(delta, two_to_m_plus_1),
-          shiftt::SHIFT_LEFT,
-          (std::size_t)1 << n),
-        zero_extension(delta, two_to_m_plus_1)));
+    bvt delta_times_F_n = add(
+      shift(
+        zero_extension(delta, result_size),
+        shiftt::SHIFT_LEFT,
+        (std::size_t)1 << n),
+      zero_extension(delta, result_size));
+    std::cerr << "delta * F_n: " << beautify(delta_times_F_n) << std::endl;
+    if(xi.size() > result_size)
+    {
+      bvt xi_result_size{xi.begin(), xi.begin() + result_size};
+      z_j[j] = add(xi_result_size, delta_times_F_n);
+    }
+    else
+    {
+      z_j[j] = add(zero_extension(xi, result_size), delta_times_F_n);
+    }
     std::cerr << "z_j[" << j << "]: " << beautify(z_j[j]) << std::endl;
   }
 
-  bvt result = zeros(two_to_m_plus_1);
+  bvt result = zeros(result_size);
   for(std::size_t j = 0; j < num_chunks / 2; ++j)
   {
-    if(chunk_size * j >= a.size())
+    if(chunk_size * j >= result_size)
       break;
     result = add(result, shift(z_j[j], shiftt::SHIFT_LEFT, chunk_size * j));
   }
   std::cerr << "result: " << beautify(result) << std::endl;
-  CHECK_RETURN(result.size() >= a.size());
-  result.resize(a.size());
+  CHECK_RETURN(result.size() >= op_size);
+  result.resize(op_size);
   std::cerr << "result resized: " << beautify(result) << std::endl;
 
   return result;