preda · preda · Dec 13, 2025 · Apr 7, 2025 · Apr 7, 2025 · Apr 9, 2025
diff --git a/Makefile b/Makefile
@@ -19,8 +19,12 @@ else
 CXX = g++
 endif
 
-COMMON_FLAGS = -Wall -std=c++20
-# -static-libstdc++ -static-libgcc
+ifneq ($(findstring MINGW, $(HOST_OS)), MINGW)
+COMMON_FLAGS = -Wall -std=c++20 -static-libstdc++ -static-libgcc
+else
+# For mingw-64 use this:
+COMMON_FLAGS = -Wall -std=c++20 -static-libstdc++ -static-libgcc -static
+endif
 # -fext-numeric-literals
 
 ifeq ($(HOST_OS), Darwin)

diff --git a/src/Args.cpp b/src/Args.cpp
@@ -156,6 +156,7 @@ named "config.txt" in the prpll run directory.
 -prp <exponent>    : run a single PRP test and exit, ignoring worktodo.txt
 -ll <exponent>     : run a single LL test and exit, ignoring worktodo.txt
 -verify <file>     : verify PRP-proof contained in <file>
+-smallest          : work on smallest exponent in worktodo.txt rather than the first exponent in worktodo.txt    
 -proof <power>     : generate proof of power <power> (default: optimal depending on exponent).
                      A lower power reduces disk space requirements but increases the verification cost.
                      A higher power increases disk usage a lot.
@@ -185,6 +186,8 @@ named "config.txt" in the prpll run directory.
                      2 = calculate from scratch, no memory read
                      1 = calculate using one complex multiply from cached memory and uncached memory
                      0 = read trig values from memory
+  -use INPLACE=n   : Perform tranforms in-place.  Great if the reduced memory usage fits in the GPU's L2 cache.
+                     0 = not in-place, 1 = nVidia friendly access pattern, 2 = AMD friendly access pattern.
   -use PAD=<val>   : insert pad bytes to possibly improve memory access patterns.  Val is number bytes to pad.
   -use MIDDLE_IN_LDS_TRANSPOSE=0|1  : Transpose values in local memory before writing to global memory
   -use MIDDLE_OUT_LDS_TRANSPOSE=0|1 : Transpose values in local memory before writing to global memory
@@ -194,23 +197,14 @@ named "config.txt" in the prpll run directory.
 
   -use DEBUG       : enable asserts in OpenCL kernels (slow, developers)
 
--tune              : measures the speed of the FFTs specified in -fft <spec> to find the best FFT for each exponent.
-
--ctune <configs>   : finds the best configuration for each FFT specified in -fft <spec>.
-                     Prints the results in a form that can be incorporated in config.txt
-                      -fft 6.5M  -ctune "OUT_SIZEX=32,8;OUT_WG=64,128,256"
-
-                     It is possible to specify -ctune multiple times on the same command in order to define multiple
-                     sets of parameters to be combined, e.g.:
-                        -ctune "IN_WG=256,128,64" -ctune "OUT_WG=256,64;OUT_SIZEX=32,16,8"
-                     which would try only 8 combinations among those two sets.
-
-                     The tunable parameters (with the default value emphasized) are:
-                       IN_WG, OUT_WG: 64, 128, *256*
-                       IN_SIZEX, OUT_SIZEX: 4, 8, 16, *32*
-                       UNROLL_W: *0*, 1
-                       UNROLL_H: 0, 1
-
+-tune <options>    : Looks for best settings to include in config.txt.  Times many FFTs to find fastest one to test exponents -- written to tune.txt.
+                     An -fft <spec> can be given on the command line to limit which FFTs are timed.
+                     Options are not required.  If present, the options are a comma separated list from below.
+                         noconfig     - Skip timings to find best config.txt settings
+                         fp64         - Tune for settings that affect FP64 FFTs.  Time FP64 FFTs for tune.txt.
+                         ntt          - Tune for settings that affect integer NTTs.  Time integer NTTs for tune.txt.
+                         minexp=<val> - Time FFTs to find the best one for exponents greater than <val>.
+                         maxexp=<val> - Time FFTs to find the best one for exponents less than <val>.
 -device <N>        : select the GPU at position N in the list of devices
 -uid    <UID>      : select the GPU with the given UID (on ROCm/AMDGPU, Linux)
 -pci    <BDF>      : select the GPU with the given PCI BDF, e.g. "0c:00.0"
@@ -236,31 +230,34 @@ Device selection : use one of -uid <UID>, -pci <BDF>, -device <N>, see the list
            );
 
   }
-  printf("\nFFT Configurations (specify with -fft <width>:<middle>:<height> from the set below):\n"
+  printf("\nFFT Configurations (specify with -fft <type>:<width>:<middle>:<height> from the set below):\n"
          " Size   MaxExp   BPW    FFT\n");
-  
+
   vector<FFTShape> configs = FFTShape::allShapes();
   configs.push_back(configs.front()); // dummy guard for the loop below.
-  string variants;
   u32 activeSize = 0;
-  double maxBpw = 0;
-  for (auto c : configs) {
-    if (c.size() != activeSize) {
-      if (!variants.empty()) {
-        printf("%5s  %7.2fM  %.2f  %s\n",
-               numberK(activeSize).c_str(),
-               // activeSize * FFTShape::MIN_BPW / 1'000'000,
-               activeSize * maxBpw / 1'000'000.0,
-               maxBpw,
-               variants.c_str());
-        variants.clear();
+  float maxBpw = 0;
+  string variants;
+  for (enum FFT_TYPES type : {FFT64, FFT3161, FFT3261, FFT61}) {
+    for (auto c : configs) {
+      if (c.fft_type != type) continue;
+      if (c.size() != activeSize) {
+        if (!variants.empty()) {
+          printf("%5s  %7.2fM  %.2f  %s\n",
+                 numberK(activeSize).c_str(),
+                 // activeSize * FFTShape::MIN_BPW / 1'000'000,
+                 activeSize * maxBpw / 1'000'000.0,
+                 maxBpw,
+                 variants.c_str());
+          variants.clear();
+        }
+        activeSize = c.size();
+        maxBpw = 0;
       }
-      activeSize = c.size();
-      maxBpw = 0;
+      maxBpw = max(maxBpw, c.maxBpw());
+      if (!variants.empty()) { variants.push_back(','); }
+      variants += c.spec();
     }
-    maxBpw = max(maxBpw, c.maxBpw());
-    if (!variants.empty()) { variants.push_back(','); }
-    variants += c.spec();
   }
 }
 
@@ -295,9 +292,10 @@ void Args::parse(const string& line) {
         log("-info expects an FFT spec, e.g. -info 1K:13:256\n");
         throw "-info <fft>";
       }
-      log(" FFT         | BPW   | Max exp (M)\n");
+      log(" FFT              | BPW   | Max exp (M)\n");
       for (const FFTShape& shape : FFTShape::multiSpec(s)) {
         for (u32 variant = 0; variant <= LAST_VARIANT; variant = next_variant (variant)) {
+          if (variant != LAST_VARIANT && shape.fft_type != FFT64) continue;
           FFTConfig fft{shape, variant, CARRY_AUTO};
           log("%12s | %.2f | %5.1f\n", fft.spec().c_str(), fft.maxBpw(), fft.maxExp() / 1'000'000.0);
         }
@@ -310,8 +308,8 @@ void Args::parse(const string& line) {
       assert(s.empty());
       logROE = true;
     } else if (key == "-tune") {
-      assert(s.empty());
       doTune = true;
+      if (!s.empty()) { tune = s; }
     } else if (key == "-ctune") {
       doCtune = true;
       if (!s.empty()) { ctune.push_back(s); }
@@ -372,6 +370,7 @@ void Args::parse(const string& line) {
     else if (key == "-iters") { iters = stoi(s); assert(iters && (iters % 10000 == 0)); }
     else if (key == "-prp" || key == "-PRP") { prpExp = stoll(s); }
     else if (key == "-ll" || key == "-LL") { llExp = stoll(s); }
+    else if (key == "-smallest") { smallest = true; }
     else if (key == "-fft") { fftSpec = s; }
     else if (key == "-dump") { dump = s; }
     else if (key == "-user") { user = s; }

diff --git a/src/Args.h b/src/Args.h
@@ -43,6 +43,7 @@ class Args {
   string uid;
   string verifyPath;
 
+  string tune;
   vector<string> ctune;
 
   bool doCtune{};
@@ -53,14 +54,15 @@ class Args {
 
   std::map<std::string, std::string> flags;
   std::map<std::string, vector<KeyVal>> perFftConfig;
-  
+
   int device = 0;
 
   bool safeMath = true;
   bool clean = true;
   bool verbose = false;
   bool useCache = false;
   bool profile = false;
+  bool smallest = false;
 
   fs::path masterDir;
   fs::path proofResultDir = "proof";

diff --git a/src/Background.h b/src/Background.h
@@ -16,10 +16,10 @@
 class Background {
   unsigned maxSize;
   std::deque<std::function<void()> > tasks;
-  std::jthread thread;
   std::mutex mut;
   std::condition_variable cond;
-  bool stopRequested{};
+  bool stopRequested;
+  std::jthread thread;
 
   void run() {
     std::function<void()> task;
@@ -59,6 +59,7 @@ class Background {
 public:
   Background(unsigned size = 2) :
     maxSize{size},
+    stopRequested(false),
     thread{&Background::run, this} {
   }
 

diff --git a/src/Buffer.h b/src/Buffer.h
@@ -26,7 +26,7 @@ class Buffer {
 
   Queue* queue;
   TimeInfo *tInfo;
-  
+
   Buffer(cl_context context, TimeInfo *tInfo, Queue* queue, size_t size, unsigned flags, const T* ptr = nullptr)
     : ptr{size == 0 ? NULL : makeBuf_(context, flags, size * sizeof(T), ptr)}
     , size{size}