Fix MMA elements-per-thread propagation: compute per-operand values for LHS, RHS and accumulator

tyb0807 · tyb0807 · commit 14626949fc9f · 2025-12-23T23:41:59.000+01:00
Signed-off-by: tyb0807 <sontuan.vu@amd.com>
diff --git a/water/include/water/Dialect/Wave/IR/WaveOps.td b/water/include/water/Dialect/Wave/IR/WaveOps.td
@@ -133,8 +133,9 @@ def MmaOp : WaveOp<"mma",
   let hasVerifier = 1;
 
   let extraClassDeclaration = [{
-    /// Compute the expected elements per thread for this MMA operation.
-    unsigned computeElementsPerThread();
+    /// Compute the expected elements per thread for one operand of this MMA operation (0 = LHS, 1 = RHS, 2 = accumulator/result).
+    /// Returns failure if the MMA kind is not set, the operand index is out of range, or no hardware constraint is found.
+    llvm::FailureOr<unsigned> computeElementsPerThreadForOperand(unsigned operandIndex);
   }];
 }
 
diff --git a/water/lib/Dialect/Wave/IR/WaveOps.cpp b/water/lib/Dialect/Wave/IR/WaveOps.cpp
@@ -1078,15 +1078,15 @@ LogicalResult MmaOp::verify() {
                                    accumulatorType.getElementType());
 }
 
-/// Compute the expected elements per thread for this MMA operation.
-/// Extracts threadsPerWave from ancestor operations with hardware constraints.
-/// Returns 0 if no constraints are found.
-unsigned wave::MmaOp::computeElementsPerThread() {
-  wave::WaveMmaKind kind = getKind();
-  if (!kind) {
-    return 0;
+/// Compute the expected elements per thread for a specific MMA operand.
+/// operandIndex: 0=LHS, 1=RHS, 2=Accumulator/Result
+/// Returns failure if the MMA kind is not set, operandIndex is out of range, or no hardware constraint is found.
+llvm::FailureOr<unsigned> wave::MmaOp::computeElementsPerThreadForOperand(unsigned operandIndex) {
+  if (!getKindAttr()) {
+    return mlir::failure();
   }
-  wave::WaveMmaSpec spec = wave::WaveMmaKindAttr::getSpec(getContext(), *kind);
+  wave::WaveMmaKind kind = getKind();
+  wave::WaveMmaSpec spec = wave::WaveMmaKindAttr::getSpec(getContext(), kind);
 
   // Extract threads per wave from hardware constraint by walking up the
   // ancestry.
@@ -1097,28 +1097,42 @@ unsigned wave::MmaOp::computeElementsPerThread() {
       for (mlir::Attribute constraint : constraints) {
         if (auto hardwareConstraint =
                 llvm::dyn_cast<wave::HardwareConstraintAttr>(constraint)) {
-          unsigned totalElements = spec.m * spec.n;
+          unsigned totalElements;
+          switch (operandIndex) {
+            case 0: // LHS: M x K
+              totalElements = spec.m * spec.k;
+              break;
+            case 1: // RHS: N x K
+              totalElements = spec.n * spec.k;
+              break;
+            case 2: // Accumulator/Result: M x N
+              totalElements = spec.m * spec.n;
+              break;
+            default:
+              return mlir::failure();
+          }
           return totalElements / hardwareConstraint.getThreadsPerWave();
         }
       }
     }
     op = op->getParentOp();
   }
 
-  // Return 0 to indicate failure if no constraints found.
-  return 0;
+  // Return failure if no constraints found.
+  return mlir::failure();
 }
 
 llvm::FailureOr<mlir::ChangeResult>
 wave::MmaOp::propagateElementsPerThreadForward(
     llvm::ArrayRef<wave::ElementsPerThreadLatticeValue> operandElements,
     llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> resultElements,
     llvm::raw_ostream &errs) {
-  unsigned expectedElementsPerThread = computeElementsPerThread();
-  if (expectedElementsPerThread == 0) {
+  llvm::FailureOr<unsigned> expectedElementsPerThreadResult = computeElementsPerThreadForOperand(getAccumulatorMutable().getOperandNumber());
+  if (llvm::failed(expectedElementsPerThreadResult)) {
     errs << "MMA operation has no hardware constraints available";
     return mlir::failure();
   }
+  unsigned expectedElementsPerThread = *expectedElementsPerThreadResult;
   wave::ElementsPerThreadLatticeValue expectedResult(expectedElementsPerThread);
   return wave::detail::checkAndPropagateElementsPerThreadFromConstant(
       expectedResult, llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(),
@@ -1133,48 +1147,78 @@ wave::MmaOp::propagateElementsPerThreadBackward(
   // For MMA, the accumulator should have the same elements per thread as the
   // result. The LHS and RHS operands may have different constraints based on
   // their dimensions.
-  unsigned expectedElementsPerThread = computeElementsPerThread();
-  if (expectedElementsPerThread == 0) {
+  // MMA operation always has exactly 3 operands: LHS, RHS, Accumulator
+  assert(operandElements.size() == 3 && "MMA operation must have exactly 3 operands");
+
+  unsigned lhsOperandNumber = getLhsMutable().getOperandNumber();
+  unsigned rhsOperandNumber = getRhsMutable().getOperandNumber();
+  unsigned accumulatorOperandNumber = getAccumulatorMutable().getOperandNumber();
+
+  // Compute expected elements per thread for each operand
+  llvm::FailureOr<unsigned> expectedLhsElementsPerThreadResult = computeElementsPerThreadForOperand(lhsOperandNumber);
+  llvm::FailureOr<unsigned> expectedRhsElementsPerThreadResult = computeElementsPerThreadForOperand(rhsOperandNumber);
+  llvm::FailureOr<unsigned> expectedAccumulatorElementsPerThreadResult = computeElementsPerThreadForOperand(accumulatorOperandNumber);
+
+  if (llvm::failed(expectedLhsElementsPerThreadResult) || llvm::failed(expectedRhsElementsPerThreadResult) || llvm::failed(expectedAccumulatorElementsPerThreadResult)) {
     errs << "MMA operation has no hardware constraints available";
     return mlir::failure();
   }
-  wave::ElementsPerThreadLatticeValue expectedAccumulator(
-      expectedElementsPerThread);
 
-  unsigned accumulatorOperandNumber =
-      getAccumulatorMutable().getOperandNumber();
+  unsigned expectedLhsElementsPerThread = *expectedLhsElementsPerThreadResult;
+  unsigned expectedRhsElementsPerThread = *expectedRhsElementsPerThreadResult;
+  unsigned expectedAccumulatorElementsPerThread = *expectedAccumulatorElementsPerThreadResult;
 
-  // Validate that LHS and RHS operands have concrete elements_per_thread
-  // values. We don't propagate to them, but we check they've been properly
-  // initialized. During analysis initialization, bottom values are acceptable -
-  // we return NoChange to let the analysis continue rather than failing.
-  // LHS (0) and RHS (1) operands.
-  bool allLhsRhsInitialized = true;
-  for (unsigned i = 0; i < 2 && i < operandElements.size(); ++i) {
-    if (operandElements[i].isBottom()) {
-      allLhsRhsInitialized = false;
-      break;
-    }
+  wave::ElementsPerThreadLatticeValue expectedLhs(expectedLhsElementsPerThread);
+  wave::ElementsPerThreadLatticeValue expectedRhs(expectedRhsElementsPerThread);
+  wave::ElementsPerThreadLatticeValue expectedAccumulator(expectedAccumulatorElementsPerThread);
+
+  // Propagate elements_per_thread to LHS operand using the helper function
+  llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> lhsOnly =
+      operandElements.slice(lhsOperandNumber, 1);
+
+  llvm::FailureOr<mlir::ChangeResult> lhsResult =
+      wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+          expectedLhs,
+          llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(), lhsOnly,
+          "computed from MMA kind", "", "LHS operand", errs);
+
+  if (llvm::failed(lhsResult)) {
+    return llvm::failure();
   }
 
-  // If LHS/RHS operands are still at bottom, return NoChange to allow
-  // the analysis to continue. Forward propagation will initialize them.
-  if (!allLhsRhsInitialized) {
-    return mlir::ChangeResult::NoChange;
+  // Propagate elements_per_thread to RHS operand using the helper function
+  llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> rhsOnly =
+      operandElements.slice(rhsOperandNumber, 1);
+
+  llvm::FailureOr<mlir::ChangeResult> rhsResult =
+      wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+          expectedRhs,
+          llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(), rhsOnly,
+          "computed from MMA kind", "", "RHS operand", errs);
+
+  if (llvm::failed(rhsResult)) {
+    return mlir::failure();
   }
 
   // Propagate to the accumulator operand.
-  if (operandElements.size() > accumulatorOperandNumber) {
-    llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> accumulatorOnly =
-        operandElements.slice(accumulatorOperandNumber, 1);
-
-    return wave::detail::checkAndPropagateElementsPerThreadFromConstant(
-        expectedAccumulator,
-        llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(), accumulatorOnly,
-        "computed from MMA kind", "", "accumulator operand", errs);
+  llvm::MutableArrayRef<wave::ElementsPerThreadLatticeValue> accumulatorOnly =
+      operandElements.slice(accumulatorOperandNumber, 1);
+
+  llvm::FailureOr<mlir::ChangeResult> accumulatorResult =
+      wave::detail::checkAndPropagateElementsPerThreadFromConstant(
+          expectedAccumulator,
+          llvm::ArrayRef<wave::ElementsPerThreadLatticeValue>(), accumulatorOnly,
+          "computed from MMA kind", "", "accumulator operand", errs);
+
+  if (llvm::failed(accumulatorResult)) {
+    return mlir::failure();
   }
 
-  return mlir::ChangeResult::NoChange;
+  // Return Change if any operand changed
+  return (*lhsResult == mlir::ChangeResult::Change ||
+          *rhsResult == mlir::ChangeResult::Change ||
+          *accumulatorResult == mlir::ChangeResult::Change) ?
+         mlir::ChangeResult::Change : mlir::ChangeResult::NoChange;
 }
 
 //-----------------------------------------------------------------------------
@@ -1355,6 +1399,7 @@ mlir::LogicalResult wave::RegisterOp::verify() {
   return mlir::success();
 }
 
+
 //-----------------------------------------------------------------------------
 // ExtractSliceOp
 //-----------------------------------------------------------------------------
diff --git a/water/test/Dialect/Wave/propagate-elements-per-thread.mlir b/water/test/Dialect/Wave/propagate-elements-per-thread.mlir
@@ -163,18 +163,18 @@ module {
 // -----
 
 module attributes {wave.normal_form = #wave.normal_form<full_types>} {
-func.func @mma_uninitialized_lhs(%mem1: !wave.tensor<[@N, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
-  // LHS without elements_per_thread - this will remain uninitialized.
+func.func @mma_compute_lhs_from_rhs(%mem1: !wave.tensor<[@N, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
+  // LHS without elements_per_thread - will be computed from the MMA kind and the hardware constraint (threads per wave).
   %lhs_init = arith.constant 0.0 : f16
   %lhs = wave.register %lhs_init : !wave.tensor<[@M, @K] of f16, <register>>
 
   // RHS properly initialized through read operation.
-  %rhs = wave.read %mem1 {elements_per_thread = 4} : (!wave.tensor<[@N, @K] of f16, <global>>) -> !wave.tensor<[@N, @K] of f16, <register>>
+  %rhs = wave.read %mem1 {elements_per_thread = 8} : (!wave.tensor<[@N, @K] of f16, <global>>) -> !wave.tensor<[@N, @K] of f16, <register>>
 
   // ACC properly initialized through read operation.
-  %acc = wave.read %mem2 {elements_per_thread = 4} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+  %acc = wave.read %mem2 {elements_per_thread = 8} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
 
-  // expected-error @below {{failed to propagate elements per thread backward: MMA operand #0 (LHS) has uninitialized elements_per_thread}}
+  // LHS elements_per_thread computed via MMA backward propagation
   %result = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind<f32_16x16x16_f16>} : (!wave.tensor<[@M, @K] of f16, <register>>, !wave.tensor<[@N, @K] of f16, <register>>, !wave.tensor<[@M, @N] of f32, <register>>) -> !wave.tensor<[@M, @N] of f32, <register>>
   return
 }
@@ -183,19 +183,61 @@ func.func @mma_uninitialized_lhs(%mem1: !wave.tensor<[@N, @K] of f16, <global>>,
 // -----
 
 module attributes {wave.normal_form = #wave.normal_form<full_types>} {
-func.func @mma_uninitialized_rhs(%mem1: !wave.tensor<[@M, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
+func.func @mma_compute_rhs_from_lhs(%mem1: !wave.tensor<[@M, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
   // LHS properly initialized through read operation.
-  %lhs = wave.read %mem1 {elements_per_thread = 4} : (!wave.tensor<[@M, @K] of f16, <global>>) -> !wave.tensor<[@M, @K] of f16, <register>>
+  %lhs = wave.read %mem1 {elements_per_thread = 8} : (!wave.tensor<[@M, @K] of f16, <global>>) -> !wave.tensor<[@M, @K] of f16, <register>>
 
-  // RHS without elements_per_thread - this will remain uninitialized.
+  // RHS without elements_per_thread - will be computed from the MMA kind and the hardware constraint (threads per wave).
   %rhs_init = arith.constant 0.0 : f16
   %rhs = wave.register %rhs_init : !wave.tensor<[@N, @K] of f16, <register>>
 
   // ACC properly initialized through read operation.
-  %acc = wave.read %mem2 {elements_per_thread = 4} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+  %acc = wave.read %mem2 {elements_per_thread = 8} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
 
-  // expected-error @below {{failed to propagate elements per thread backward: MMA operand #1 (RHS) has uninitialized elements_per_thread}}
+  // RHS elements_per_thread computed via MMA backward propagation
   %result = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind<f32_16x16x16_f16>} : (!wave.tensor<[@M, @K] of f16, <register>>, !wave.tensor<[@N, @K] of f16, <register>>, !wave.tensor<[@M, @N] of f32, <register>>) -> !wave.tensor<[@M, @N] of f32, <register>>
   return
 }
 }
+
+// -----
+
+// Test MMA can compute both LHS and RHS when both are uninitialized
+module attributes {wave.normal_form = #wave.normal_form<full_types>} {
+  func.func @mma_compute_both_lhs_rhs(%mem1: !wave.tensor<[@M, @K] of f16, <global>>, %mem2: !wave.tensor<[@N, @K] of f16, <global>>, %mem3: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
+    // Both LHS and RHS without elements_per_thread - can compute from MMA formulas
+    %lhs_init = arith.constant 0.0 : f16
+    %lhs = wave.register %lhs_init : !wave.tensor<[@M, @K] of f16, <register>>
+    %rhs_init = arith.constant 0.0 : f16
+    %rhs = wave.register %rhs_init : !wave.tensor<[@N, @K] of f16, <register>>
+
+    // ACC properly initialized through read operation.
+    %acc = wave.read %mem3 {elements_per_thread = 8} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+
+    // With proper MMA formulas, we can now compute both LHS and RHS from constraints,
+    // so this should succeed instead of failing
+    %result = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind<f32_16x16x16_f16>} : (!wave.tensor<[@M, @K] of f16, <register>>, !wave.tensor<[@N, @K] of f16, <register>>, !wave.tensor<[@M, @N] of f32, <register>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+    return
+  }
+}
+
+// -----
+
+// Test MMA error when operand has wrong elements_per_thread
+module attributes {wave.normal_form = #wave.normal_form<full_types>} {
+  func.func @mma_operand_mismatch(%mem1: !wave.tensor<[@M, @K] of f16, <global>>, %mem2: !wave.tensor<[@M, @N] of f32, <global>>) attributes {wave.hyperparameters = #wave.hyperparameters<{M = 16, N = 16, K = 16}>, wave.constraints = [#wave.hardware_constraint<threads_per_wave = 32, waves_per_block = [1, 1, 1], mma_type = #wave.mma_kind<f32_16x16x16_f16>, vector_shapes = {M = 1, N = 1, K = 16}, max_bits_per_load = 128>]} {
+    // LHS with wrong elements_per_thread (should be 8, not 4)
+    %lhs = wave.read %mem1 {elements_per_thread = 4} : (!wave.tensor<[@M, @K] of f16, <global>>) -> !wave.tensor<[@M, @K] of f16, <register>>
+
+    // RHS without elements_per_thread - will be computed from MMA constraints.
+    %rhs_init = arith.constant 0.0 : f16
+    %rhs = wave.register %rhs_init : !wave.tensor<[@N, @K] of f16, <register>>
+
+    // ACC properly initialized
+    %acc = wave.read %mem2 {elements_per_thread = 8} : (!wave.tensor<[@M, @N] of f32, <global>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+
+    // expected-error @below {{failed to propagate elements per thread backward: mismatch between computed from MMA kind (8) and LHS operand #0 (4)}}
+    %result = wave.mma %lhs, %rhs, %acc {kind = #wave.mma_kind<f32_16x16x16_f16>} : (!wave.tensor<[@M, @K] of f16, <register>>, !wave.tensor<[@N, @K] of f16, <register>>, !wave.tensor<[@M, @N] of f32, <register>>) -> !wave.tensor<[@M, @N] of f32, <register>>
+    return
+  }
+}