[Mlir-commits] [mlir] [mlir][linalg] Fix crash in linalg-specialize-generic-ops with scalar inputs (PR #189212)

Sun Mar 29 00:01:41 PDT 2026

llvmbot wrote:




@llvm/pr-subscribers-mlir

Author: Mehdi Amini (joker-eph)

<details>
<summary>Changes</summary>

## Summary

`DecomposeProjectedPermutation` (invoked by `--linalg-specialize-generic-ops`) unconditionally applied `cast<RankedTensorType>` to the type of every operand of a `linalg.generic`. However, `linalg.generic` permits scalar (non-tensor) inputs — e.g. an `i32` indexed by a result-less affine map such as `(d0, d1) -> ()` — and `hasPureTensorSemantics()` does not exclude them (it only checks that no operand is a memref). When such a scalar operand was present, the hard cast caused an assertion failure.

Fix: replace `cast<RankedTensorType>` with `dyn_cast<RankedTensorType>` and make the pattern return `failure()` (i.e. skip decomposition) when any operand is not a ranked tensor, reusing the existing early exit that already rejects dynamically shaped operands. A regression test is added.

Fixes #122094

## Test plan
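- New FileCheck test in `decompose-generic-by-unfolding-projected-permutation.mlir` that runs the pass on a `linalg.generic` with a scalar `i32` input and checks that it does not crash and that the `linalg.generic` is still present afterwards (i.e. it was not decomposed).
- All existing linalg specialize/decompose tests continue to pass.
- Note: the diff below also modifies `PipelineDataTransfer.cpp` (propagating the `alignment` attribute of the original `memref.alloc` to the double-buffer alloc) and adds two tests to `pipeline-data-transfer.mlir` referencing issue #146015; that change is not otherwise described in this summary.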

🤖 Generated with [Claude Code](https://claude.com/claude-code)

---
Full diff: https://github.com/llvm/llvm-project/pull/189212.diff


4 Files Affected:

- (modified) mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp (+7-2) 
- (modified) mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp (+3-1) 
- (modified) mlir/test/Dialect/Affine/pipeline-data-transfer.mlir (+48) 
- (modified) mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir (+28) 


``````````diff

diff --git a/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
index d84cb4f0cde5f..660418480be02 100644
--- a/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/PipelineDataTransfer.cpp
@@ -99,9 +99,14 @@ static bool doubleBuffer(Value oldMemRef, AffineForOp forOp) {
           forOp.getLoc(), oldMemRef, dim.index()));
   }
 
+  // Propagate alignment from the original alloc op, if any.
+  IntegerAttr alignment;
+  if (auto oldAllocOp = oldMemRef.getDefiningOp<memref::AllocOp>())
+    alignment = oldAllocOp.getAlignmentAttr();
+
   // Create and place the alloc right before the 'affine.for' operation.
-  Value newMemRef = memref::AllocOp::create(bOuter, forOp.getLoc(),
-                                            newMemRefType, allocOperands);
+  Value newMemRef = memref::AllocOp::create(
+      bOuter, forOp.getLoc(), newMemRefType, allocOperands, alignment);
 
   // Create 'iv mod 2' value to index the leading dimension.
   auto d0 = bInner.getAffineDimExpr(0);
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp
index 9015cbb096f88..2cf99011725fe 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DecomposeGenericByUnfoldingPermutation.cpp
@@ -164,7 +164,9 @@ LogicalResult DecomposeProjectedPermutation::matchAndRewrite(
   // out which operand can supply that runtime-value (tensor.dim).
   // Leaving it as a future TODO.
   if (llvm::any_of(op->getOpOperands(), [](OpOperand &oper) {
-        auto opType = cast<RankedTensorType>(oper.get().getType());
+        auto opType = dyn_cast<RankedTensorType>(oper.get().getType());
+        if (!opType)
+          return true;
         return ShapedType::isDynamicShape(opType.getShape());
       }))
     return failure();
diff --git a/mlir/test/Dialect/Affine/pipeline-data-transfer.mlir b/mlir/test/Dialect/Affine/pipeline-data-transfer.mlir
index 35507c37be79b..120cde8639bae 100644
--- a/mlir/test/Dialect/Affine/pipeline-data-transfer.mlir
+++ b/mlir/test/Dialect/Affine/pipeline-data-transfer.mlir
@@ -396,3 +396,51 @@ func.func @same_memref_source_and_tag(%arg0: index, %arg1: index) {
   return
 }
 // CHECK: affine.for
+
+// -----
+
+// Regression test for https://github.com/llvm/llvm-project/issues/146015.
+// The double-buffer alloc created by pipeline-data-transfer should preserve
+// the alignment attribute from the original alloc.
+// CHECK-LABEL: func @preserve_alignment
+func.func @preserve_alignment() {
+  %A = memref.alloc() : memref<256 x f32>
+  // CHECK: memref.alloc() {alignment = 1024 : i64} : memref<2x32xf32, 1>
+  %Ah = memref.alloc() {alignment = 1024} : memref<32 x f32, 1>
+  %tag = memref.alloc() : memref<1 x f32>
+  %zero = arith.constant 0 : index
+  %num_elts = arith.constant 32 : index
+
+  affine.for %i = 0 to 8 {
+    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
+    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
+    %v = affine.load %Ah[%i] : memref<32 x f32, 1>
+  }
+  memref.dealloc %tag : memref<1 x f32>
+  memref.dealloc %Ah : memref<32 x f32, 1>
+  return
+}
+
+// -----
+
+// Negative test: alloc without alignment must NOT gain a spurious alignment
+// attribute on the double-buffer alloc.
+// CHECK-LABEL: func @no_alignment_not_propagated
+func.func @no_alignment_not_propagated() {
+  %A = memref.alloc() : memref<256 x f32>
+  // CHECK: memref.alloc() : memref<2x32xf32, 1>
+  // CHECK-NOT: {alignment
+  %Ah = memref.alloc() : memref<32 x f32, 1>
+  %tag = memref.alloc() : memref<1 x f32>
+  %zero = arith.constant 0 : index
+  %num_elts = arith.constant 32 : index
+
+  affine.for %i = 0 to 8 {
+    affine.dma_start %A[%i], %Ah[%i], %tag[%zero], %num_elts : memref<256 x f32>, memref<32 x f32, 1>, memref<1 x f32>
+    affine.dma_wait %tag[%zero], %num_elts : memref<1 x f32>
+    %v = affine.load %Ah[%i] : memref<32 x f32, 1>
+  }
+  memref.dealloc %tag : memref<1 x f32>
+  memref.dealloc %Ah : memref<32 x f32, 1>
+  return
+}
diff --git a/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir b/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir
index 38e406a13ec08..86c522569cc55 100644
--- a/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir
+++ b/mlir/test/Dialect/Linalg/decompose-generic-by-unfolding-projected-permutation.mlir
@@ -69,3 +69,31 @@ func.func @broadcast_only(%x : tensor<2x16x32xf32>, %y:  tensor<2x32xf32>, %z :
 // CHECK: %[[X_bc:.+]] = linalg.broadcast ins(%[[Y]] : tensor<2x32xf32>) outs(%[[E0]] : tensor<2x16x32xf32>) dimensions = [1]
 // CHECK: {{.*}} = linalg.div ins(%[[X]], %[[X_bc]] : tensor<2x16x32xf32>, tensor<2x16x32xf32>) outs(%arg2 : tensor<2x16x32xf32>) -> tensor<2x16x32xf32>
 // CHECK-NOT: linalg.generic
+
+// -----
+
+// Verify that linalg.generic with scalar (non-tensor) inputs is not decomposed
+// and does not crash. Scalar inputs have 0-D affine maps and are not
+// RankedTensorType; the pass must handle them gracefully by bailing out.
+
+#map = affine_map<(d0, d1) -> (d0)>
+#map1 = affine_map<(d0, d1) -> (d1)>
+#map2 = affine_map<(d0, d1) -> ()>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+
+func.func @scalar_input(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>, %arg2: i32) -> tensor<4x4xi32> {
+  %0 = tensor.empty() : tensor<4x4xi32>
+  %1 = linalg.generic {indexing_maps = [#map, #map1, #map2, #map3],
+                        iterator_types = ["parallel", "parallel"]}
+    ins(%arg0, %arg1, %arg2 : tensor<4xi32>, tensor<4xi32>, i32)
+    outs(%0 : tensor<4x4xi32>) {
+  ^bb0(%in: i32, %in2: i32, %in3: i32, %out: i32):
+    %2 = arith.muli %in, %in2 : i32
+    %3 = arith.addi %in3, %2 : i32
+    linalg.yield %3 : i32
+  } -> tensor<4x4xi32>
+  return %1 : tensor<4x4xi32>
+}
+
+// CHECK-LABEL: scalar_input
+// CHECK: linalg.generic

``````````

</details>


https://github.com/llvm/llvm-project/pull/189212