Commits
36 commits
3b92772
Utilize multiple GPUs in tuningRunner.
mirza-halilcevic Dec 18, 2025
31dca63
tuningRunner.py refactoring
mirza-halilcevic Dec 19, 2025
4e7c234
Merge branch 'develop' into mgpu-tuning
mirza-halilcevic Dec 19, 2025
5e0338e
Overlap compilation and benchmarking of perf configs in rocmlir-tunin…
mirza-halilcevic Dec 19, 2025
4d71e33
Merge remote-tracking branch 'origin/mgpu-tuning' into mgpu-tuning
mirza-halilcevic Dec 19, 2025
eb8dff6
Newline at end of file.
mirza-halilcevic Dec 19, 2025
70d021d
Implement persistence logic.
mirza-halilcevic Dec 21, 2025
735fb2b
Fix thread allocation bug in tuning-driver.
mirza-halilcevic Dec 21, 2025
d36e62d
Simplify implementation and optimize for edge cases.
mirza-halilcevic Dec 22, 2025
e01d4b1
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Dec 23, 2025
a3c15e8
Merge branch 'develop' into mgpu-tuning
mirza-halilcevic Dec 24, 2025
cf2046e
Address review comments:
mirza-halilcevic Dec 25, 2025
8d6dbc3
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Dec 25, 2025
df88112
Merge remote-tracking branch 'origin/mgpu-tuning' into mgpu-tuning
mirza-halilcevic Dec 25, 2025
af53625
Improve gpus argument parsing and improve graceful shutdown.
mirza-halilcevic Dec 26, 2025
e8d29d1
Implement OutputFileWriter and DebugFileWriter.
mirza-halilcevic Dec 26, 2025
95f771b
Fix output parsing during shutdown.
mirza-halilcevic Dec 26, 2025
e6a985b
Keep track of commit hash for each tuning run.
mirza-halilcevic Dec 26, 2025
2c48a92
Remove semicolons.
mirza-halilcevic Dec 26, 2025
34fffb6
Add debug info.
mirza-halilcevic Dec 26, 2025
b6d7bea
Fix progress bar output.
mirza-halilcevic Dec 26, 2025
e3e8add
Add GPU ID to debug info.
mirza-halilcevic Dec 26, 2025
90414e7
Fix stderr deadlock.
mirza-halilcevic Dec 26, 2025
c8d7bd8
Add --num-compile-threads argument.
mirza-halilcevic Dec 28, 2025
1c55901
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Dec 29, 2025
305f5df
Improvements.
mirza-halilcevic Dec 30, 2025
9dded74
Merge branch 'develop' into mgpu-tuning
mirza-halilcevic Jan 5, 2026
e1f04bb
Implement NUMA-awareness and better encapsulation.
mirza-halilcevic Jan 6, 2026
1ef64bb
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Jan 6, 2026
f93b4cd
Improve argument parsing.
mirza-halilcevic Jan 6, 2026
a620622
Merge branch 'develop' into mgpu-tuning
mirza-halilcevic Jan 6, 2026
94bef8b
Add option to wait for compiles.
mirza-halilcevic Jan 8, 2026
aa99327
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Jan 8, 2026
924536d
Merge remote-tracking branch 'origin/develop' into mgpu-tuning
mirza-halilcevic Jan 8, 2026
9709f5b
Fix mempolicy call.
mirza-halilcevic Jan 8, 2026
8f4e6b2
Improve debug info.
mirza-halilcevic Jan 8, 2026
71 changes: 71 additions & 0 deletions mlir/tools/rocmlir-tuning-driver/ConcurrentQueue.h
@@ -0,0 +1,71 @@
//===- ConcurrentQueue.h - Simple MPMC queue --------------------*- C++ -*-===//
//
// Part of the rocMLIR Project, under the Apache License v2.0 with LLVM
// Exceptions. See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef ROCMLIR_TUNING_DRIVER_CONCURRENT_QUEUE_H
#define ROCMLIR_TUNING_DRIVER_CONCURRENT_QUEUE_H

#include "llvm/Support/Compiler.h"

#include <atomic>
#include <condition_variable>
#include <mutex>
#include <queue>

namespace rocmlir::tuningdriver {

template <typename T>
class ConcurrentQueue {
public:
template <typename U>
bool push(U &&item) {
if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed)))
return false; // Early exit if terminated

{
std::lock_guard<std::mutex> lock(mtx);
if (LLVM_UNLIKELY(done.load(std::memory_order_relaxed)))
return false; // Double-check after acquiring the lock

queue.emplace(std::forward<U>(item));
}

cv.notify_one();
return true;
}

bool pop(T &item) {
std::unique_lock<std::mutex> lock(mtx);
cv.wait(lock, [this] {
return !queue.empty() || done.load(std::memory_order_relaxed);
});

if (LLVM_UNLIKELY(queue.empty()))
return false;

item = std::move(queue.front());
queue.pop();
return true;
}

void terminate() {
done.store(true, std::memory_order_relaxed);
cv.notify_all();
}

bool isTerminated() const { return done.load(std::memory_order_relaxed); }

private:
std::queue<T> queue;
std::mutex mtx;
std::condition_variable cv;
std::atomic<bool> done{false};
};

} // namespace rocmlir::tuningdriver

#endif // ROCMLIR_TUNING_DRIVER_CONCURRENT_QUEUE_H
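For orientation, here is a minimal, self-contained sketch (not part of this change) of how the queue is meant to be used: several producer threads push results while a single consumer drains them, and the last producer to finish calls terminate() so that the consumer's pop() returns false once the queue is empty. In the driver change below, the producers are the compilation worker threads and the consumer is the sequential benchmarking loop; the item type and the thread/item counts here are illustrative only.

// Illustrative usage of ConcurrentQueue (not part of this PR): producers push
// items concurrently; the consumer pops until the queue is terminated and empty.
#include "ConcurrentQueue.h"

#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
  rocmlir::tuningdriver::ConcurrentQueue<int> queue;
  constexpr unsigned numProducers = 4;
  std::atomic<unsigned> activeProducers{numProducers};

  std::vector<std::thread> producers;
  for (unsigned p = 0; p < numProducers; ++p) {
    producers.emplace_back([&queue, &activeProducers, p] {
      for (int i = 0; i < 8; ++i)
        if (!queue.push(static_cast<int>(p) * 100 + i))
          break; // Queue was terminated early; stop producing.
      // The last producer to finish signals that no more items will arrive.
      if (activeProducers.fetch_sub(1, std::memory_order_acq_rel) == 1)
        queue.terminate();
    });
  }

  // pop() blocks until an item is available or terminate() has been called;
  // it returns false only once the queue is terminated *and* empty, so all
  // pushed items are still consumed.
  int item;
  while (queue.pop(item))
    std::printf("%d\n", item);

  for (auto &t : producers)
    t.join();
  return 0;
}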
95 changes: 55 additions & 40 deletions mlir/tools/rocmlir-tuning-driver/rocmlir-tuning-driver.cpp
Contributor
Unrelated, but it'd be nice to refactor the for (unsigned iterIdx = 0; iterIdx < numTuningIterations; ++iterIdx) loop, because I think we are launching threads for every iteration, right?

Contributor Author
Good point. Will do.

Contributor
We can do this in another PR if that's OK, since it's unrelated to this change.
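For reference, one possible shape of that refactor, sketched here purely for illustration: the worker pool is created once and fed (iteration, config) tasks through the ConcurrentQueue added in this PR, instead of spawning threads inside the per-iteration loop. Task and processTask are hypothetical stand-ins for the real per-config work, and the counts are placeholders.

// Illustrative sketch only (not this PR's implementation): persistent workers
// created once, reused across all tuning iterations.
#include "ConcurrentQueue.h"

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct Task {
  unsigned iterIdx;       // Tuning iteration this task belongs to.
  std::size_t configIdx;  // Index of the perf config to process.
};

// Hypothetical stand-in for the real per-config compile/benchmark work.
static void processTask(const Task &task) {
  std::printf("iteration %u, config %zu\n", task.iterIdx, task.configIdx);
}

int main() {
  // Placeholder sizes; in the driver these come from the tuning space.
  constexpr unsigned numTuningIterations = 3;
  constexpr std::size_t numConfigs = 16;
  const unsigned numThreads =
      std::max(1u, std::thread::hardware_concurrency());

  rocmlir::tuningdriver::ConcurrentQueue<Task> taskQueue;

  // Workers are created once, before the iteration loop.
  std::vector<std::thread> workers;
  for (unsigned i = 0; i < numThreads; ++i)
    workers.emplace_back([&taskQueue] {
      Task task;
      while (taskQueue.pop(task))
        processTask(task);
    });

  // The per-iteration loop only enqueues work; it no longer launches threads.
  for (unsigned iterIdx = 0; iterIdx < numTuningIterations; ++iterIdx)
    for (std::size_t cfg = 0; cfg < numConfigs; ++cfg)
      taskQueue.push(Task{iterIdx, cfg});

  // No more work: workers drain the remaining tasks and exit.
  taskQueue.terminate();
  for (auto &w : workers)
    w.join();
  return 0;
}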

@@ -58,7 +58,9 @@

// Utilities to allocate buffers
#include "../utils/performance/common/benchmarkUtils.h"

#include "CacheFlush.h"
#include "ConcurrentQueue.h"

#include <hip/hip_runtime.h>

@@ -160,6 +162,11 @@ static llvm::cl::opt<unsigned> numCompileThreads(
llvm::cl::desc("Number of parallel compilation threads (0 = auto)"),
llvm::cl::value_desc("thread count"), llvm::cl::init(0));

static llvm::cl::opt<bool> waitForCompiles(
"wait-for-compiles",
llvm::cl::desc("Wait for all compilations to finish before benchmarking"),
llvm::cl::init(false));

// Ripped out of JitRunner.cpp
static OwningOpRef<ModuleOp> parseMLIRInput(StringRef inputFilename,
MLIRContext *context) {
@@ -276,6 +283,7 @@ struct BenchmarkParams {
rock::TuningParamSetKind tuningSpaceKind;
const unsigned numCompileThreads;
std::string benchmarkConfig;
bool waitForCompiles;
};

enum class CompilationStatus {
@@ -740,7 +748,7 @@ static LogicalResult runTuningLoop(ModuleOp source) {
const BenchmarkParams benchmarkParams = {
numIterations, warmupIterations, useMedian, trimPercent,
sleepUs, showStats, showAllMeasurements, tuningSpaceKind,
numCompileThreads, benchmarkConfig};
numCompileThreads, benchmarkConfig, waitForCompiles};

unsigned numTuningIterations =
rock::getNumberOfIterations(benchmarkParams.tuningSpaceKind);
@@ -827,10 +835,8 @@ static LogicalResult runTuningLoop(ModuleOp source) {
}

// PHASE 3: Parallel compilation phase using pre-initialized resources
std::vector<CompilationResult> compilationResults(configs.size());
ConcurrentQueue<CompilationResult> compilationResults;
std::mutex outputMutex; // For thread-safe console output
std::atomic<bool> compilationFailed{
false}; // Flag to signal early termination

// Compile a single config using pre-initialized thread resources
auto compileConfig = [&](size_t idx,
Expand Down Expand Up @@ -876,7 +882,6 @@ static LogicalResult runTuningLoop(ModuleOp source) {
auto tunedFunc = sourceCopy->lookupSymbol<func::FuncOp>(fnName);
if (!tunedFunc) {
result.status = CompilationStatus::CompilationFailed;
compilationFailed.store(true, std::memory_order_relaxed);
return result;
}
result.blockSizes.push_back(
@@ -891,7 +896,6 @@
llvm::errs() << "Backend pipeline failed for config: "
<< result.perfConfig << "\n";
result.status = CompilationStatus::CompilationFailed;
compilationFailed.store(true, std::memory_order_relaxed);
return result;
}

@@ -901,7 +905,6 @@
sourceCopy->lookupSymbol<gpu::BinaryOp>(fnName + "_module");
if (!binary) {
result.status = CompilationStatus::CompilationFailed;
compilationFailed.store(true, std::memory_order_relaxed);
return result;
}
result.hipModules.push_back(
@@ -920,53 +923,65 @@
// compilation times vary dramatically between configs (NotApplicable is
// fast, full compilation is slow). Dynamic work stealing provides better
// load balancing by allowing fast threads to pick up more work.
{
std::atomic<size_t> nextIdx{0};
std::atomic<unsigned> nextThreadId{0};

auto worker = [&]() {
// Each worker gets assigned a unique thread ID for its resources
unsigned myThreadId =
nextThreadId.fetch_add(1, std::memory_order_relaxed);
ThreadResources &myRes = threadResources[myThreadId];

while (true) {
if (compilationFailed.load(std::memory_order_relaxed))
break;
std::atomic<size_t> nextIdx{0};
std::atomic<unsigned> nextThreadId{0};
std::atomic<size_t> activeThreads{numThreads};
auto worker = [&] {
// Each worker gets assigned a unique thread ID for its resources
unsigned myThreadId =
nextThreadId.fetch_add(1, std::memory_order_relaxed);
ThreadResources &myRes = threadResources[myThreadId];

while (true) {
size_t idx = nextIdx.fetch_add(1, std::memory_order_relaxed);
if (idx >= configs.size())
break;

if (compilationResults.isTerminated())
break; // Avoid unnecessary work

if (!compilationResults.push(compileConfig(idx, myRes)))
break; // Queue terminated
}

size_t idx = nextIdx.fetch_add(1, std::memory_order_relaxed);
if (idx >= configs.size())
break;
if (activeThreads.fetch_sub(1, std::memory_order_acq_rel) == 1) {
// Last thread - signal termination
compilationResults.terminate();
}
};

compilationResults[idx] = compileConfig(idx, myRes);
}
};
std::vector<std::thread> threads;
threads.reserve(numThreads);
for (unsigned i = 0; i < numThreads; ++i) {
threads.emplace_back(worker);
}

std::vector<std::thread> threads;
threads.reserve(numThreads);
for (unsigned i = 0; i < numThreads; ++i) {
threads.emplace_back(worker);
auto threadCleanup = llvm::make_scope_exit([&] {
// In case of early termination, signal all threads to stop
compilationResults.terminate();
for (auto &t : threads) {
t.join();
}
});

if (benchmarkParams.waitForCompiles) {
for (auto &t : threads) {
t.join();
}
}

// Check if any compilation failed and terminate early
if (compilationFailed.load(std::memory_order_relaxed)) {
llvm::errs()
<< "Compilation failed for one or more configs. Terminating.\n";
return failure();
threads.clear();
}

int64_t validResults = 0;
// Sequential benchmarking phase (must be sequential for accurate timing)
// Note: Due to early exit on compilation failures, only NotApplicable and
// Success statuses are possible here.
for (const auto &result : compilationResults) {
CompilationResult result;
while (compilationResults.pop(result)) {
llvm::outs() << result.perfConfig << "\t";

if (result.status == CompilationStatus::CompilationFailed) {
llvm::errs() << "Compilation failed\n";
return failure();
}

if (result.status == CompilationStatus::NotApplicable) {
llvm::outs() << "N/A\n";
continue;
4 changes: 2 additions & 2 deletions mlir/utils/jenkins/Jenkinsfile
@@ -1179,10 +1179,10 @@ PY
stage("Tune Fusion") {
dir('build') {
// Tune resnet50
sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test_dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv"""
sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/fusion/resnet50-e2e/ -o tuning_fusion_${CHIP}.tsv"""

// Tune bert
sh """python3 ./bin/tuningRunner.py --abort-on-error --op fusion --test_dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv"""
sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error --op fusion --test-dir ../mlir/test/xmir/bert-torch-tosa-e2e/ -o tuning_fusion_${CHIP}.tsv"""
}
sh 'rm -f build/CMakeCache.txt'
}
8 changes: 4 additions & 4 deletions mlir/utils/jenkins/Jenkinsfile.downstream
@@ -150,14 +150,14 @@ pipeline {
dir('build') {
timeout(time: 60, activity: true, unit: 'MINUTES') {
// Tune gemms, fail if the DB is not created
sh """python3 ./bin/tuningRunner.py --abort-on-error \
sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \
--operation gemm \
--configs_file=../mlir/utils/jenkins/ci-configs/selected-gemm-configs \
--configs-file=../mlir/utils/jenkins/ci-configs/selected-gemm-configs \
--output=tuning_gemm.tsv
[ -f tuning_gemm.tsv ]"""
sh """python3 ./bin/tuningRunner.py --abort-on-error \
sh """python3 ./bin/tuningRunner.py --quiet --abort-on-error \
--operation conv \
--configs_file=../mlir/utils/jenkins/ci-configs/selected-conv-configs \
--configs-file=../mlir/utils/jenkins/ci-configs/selected-conv-configs \
--output=tuning_conv.tsv
[ -f tuning_conv.tsv ]"""
}
7 changes: 4 additions & 3 deletions mlir/utils/performance/perfRunner.py
@@ -529,15 +529,16 @@ def from_command_line(cls, argv, arch, num_cu):
datatype = 'bf8_fp8'
elif argv[0] == 'convbf8_bf8':
datatype = 'bf8_bf8'
else:
raise ValueError(f"Unknown conv datatype: {argv[0]}")

try:
# TBD:
# implement -m ?
# implement -t ?
opts, _ = getopt.getopt(argv[1:], "F:f:I:O:n:c:H:W:k:y:x:p:q:l:j:u:v:g:m:t:")
except getopt.GetoptError:
print('getopt error')
sys.exit(1)
except getopt.GetoptError as e:
raise ValueError(f"Invalid conv config: {e}")

for opt, arg in opts:
if opt == '-F':