[llvm] [mlir] [MLIR] Add `InParallelOpInterface` for parallel combining operations (PR #157736)

Fri Sep 12 13:58:06 PDT 2025

https://github.com/lialan updated https://github.com/llvm/llvm-project/pull/157736

>From 0f5c62740b1c25f0d1096c806e8b83fda44b1f0e Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Sun, 7 Sep 2025 22:41:16 -0400
Subject: [PATCH 01/11] InParallelOpInterface.

---
 .../mlir/Dialect/Tensor/IR/TensorOps.td       |   2 +
 .../Interfaces/ParallelCombiningOpInterface.h |   1 +
 .../ParallelCombiningOpInterface.td           |  29 +++++
 .../TransformOps/LinalgTransformOps.cpp       |  14 ++-
 .../Linalg/Transforms/DropUnitDims.cpp        |   8 +-
 mlir/lib/Dialect/SCF/IR/SCF.cpp               |  55 +++++----
 .../BufferDeallocationOpInterfaceImpl.cpp     |   6 +-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp      |  22 +++-
 .../BufferizableOpInterfaceImpl.cpp           |   4 +-
 .../Tensor/Transforms/FoldTensorSubsetOps.cpp |   6 +-
 .../ParallelCombiningOpInterface.cpp          |  37 +++++-
 .../Dialect/Linalg/drop-unit-extent-dims.mlir |  26 ++++-
 mlir/test/Dialect/SCF/invalid.mlir            |  10 +-
 .../SCF/one-shot-bufferize-analysis.mlir      | 108 ++++++++++++++++++
 ...-shot-bufferize-tensor-copy-insertion.mlir |  30 ++++-
 15 files changed, 298 insertions(+), 60 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 7d396e5c64c28..842a76e8fe90f 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1474,6 +1474,8 @@ def Tensor_PadOp : Tensor_Op<"pad", [
 def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
        AttrSizedOperandSegments,
        OffsetSizeAndStrideOpInterface,
+       DeclareOpInterfaceMethods<InParallelOpInterface,
+          ["getUpdatedDestinations", "getIteratingParent"]>,
        // TODO: Cannot use an interface here atm, verify this manually for now.
        // HasParent<"ParallelCombiningOpInterface">
   ]> {
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
index 72db06163df37..e3441b8322d96 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
@@ -20,6 +20,7 @@ namespace mlir {
 namespace detail {
 // TODO: Single region single block interface on interfaces ?
 LogicalResult verifyParallelCombiningOpInterface(Operation *op);
+LogicalResult verifyInParallelOpInterface(Operation *op);
 } // namespace detail
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
index 424b4cf0a0a58..86eaf2c95462c 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
@@ -56,4 +56,33 @@ def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   }];
 }
 
+def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
+  let description = [{
+    An in_parallel op is an operation that inserts into a shared tensor in
+    conjunction with a parent combining and iterating op.
+  }];
+  let cppNamespace = "::mlir";
+
+  let methods = [
+    InterfaceMethod<[{
+        Returns the list of values updated by this op.
+      }],
+      /*retTy=*/"::mlir::MutableOperandRange",
+      /*methodName=*/"getUpdatedDestinations",
+      /*args=*/(ins)
+    >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns the iterating parent for this op.
+      }],
+      /*retTy=*/"::mlir::Operation*",
+      /*methodName=*/"getIteratingParent",
+      /*args=*/(ins)
+    >,
+  ];
+  let verify = [{
+    return ::mlir::detail::verifyInParallelOpInterface($_op);
+  }];
+}
+
 #endif // MLIR_INTERFACES_PARALLELCOMBININGOPINTERFACE
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index f0c1f4485b054..840737fdb836b 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -36,6 +36,7 @@
 #include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
+#include "mlir/Interfaces/ParallelCombiningOpInterface.h"
 #include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -4140,11 +4141,14 @@ DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
     return DiagnosedSilenceableFailure::success();
   }
 
-  // If we are inside an InParallel region, temporarily set the insertion point
-  // outside: only tensor.parallel_insert_slice ops are allowed in there.
-  if constexpr (std::is_same_v<OpTy, tensor::ParallelInsertSliceOp>) {
-    rewriter.setInsertionPoint(
-        target->template getParentOfType<scf::InParallelOp>());
+  // If we are inside a ParallelCombiningOp region, temporarily set the
+  // insertion point outside: only ops implementing InParallelOpInterface are
+  // allowed in there.
+  if (isa<mlir::InParallelOpInterface>(target.getOperation())) {
+    if (auto combiningParent =
+            dyn_cast<ParallelCombiningOpInterface>(target->getParentOp())) {
+      rewriter.setInsertionPoint(target->getParentOp());
+    }
   }
 
   Value extracted = tensor::ExtractSliceOp::create(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 22690daa4f9e1..9eea88fb5a837 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -784,8 +784,12 @@ struct RankReducedInsertSliceOp : public OpRewritePattern<InsertOpTy> {
       // The only difference between InsertSliceOp and ParallelInsertSliceOp
       // is the insertion point is just before the ParallelCombiningOp in the
       // parallel case.
-      if (std::is_same<InsertOpTy, tensor::ParallelInsertSliceOp>::value)
-        rewriter.setInsertionPoint(insertSliceOp->getParentOp());
+      if (std::is_same<InsertOpTy, tensor::ParallelInsertSliceOp>::value) {
+        if (auto combiningParent = dyn_cast<ParallelCombiningOpInterface>(
+                insertSliceOp->getParentOp())) {
+          rewriter.setInsertionPoint(insertSliceOp->getParentOp());
+        }
+      }
       reshapedSource = tensor::CollapseShapeOp::create(
           rewriter, loc, insertSliceOp.getSource(), *reassociation);
     }
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 84f9777a443fd..873dbbde48b37 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -680,8 +680,11 @@ void mlir::scf::promote(RewriterBase &rewriter, scf::ForallOp forallOp) {
   SmallVector<Value> results;
   results.reserve(forallOp.getResults().size());
   for (auto &yieldingOp : terminator.getYieldingOps()) {
+    // Skip non-ParallelInsertSliceOp operations
     auto parallelInsertSliceOp =
-        cast<tensor::ParallelInsertSliceOp>(yieldingOp);
+        dyn_cast<tensor::ParallelInsertSliceOp>(yieldingOp);
+    if (!parallelInsertSliceOp)
+      continue;
 
     Value dst = parallelInsertSliceOp.getDest();
     Value src = parallelInsertSliceOp.getSource();
@@ -1437,14 +1440,12 @@ InParallelOp ForallOp::getTerminator() {
   return cast<InParallelOp>(getBody()->getTerminator());
 }
 
+
 SmallVector<Operation *> ForallOp::getCombiningOps(BlockArgument bbArg) {
   SmallVector<Operation *> storeOps;
-  InParallelOp inParallelOp = getTerminator();
-  for (Operation &yieldOp : inParallelOp.getYieldingOps()) {
-    if (auto parallelInsertSliceOp =
-            dyn_cast<tensor::ParallelInsertSliceOp>(yieldOp);
-        parallelInsertSliceOp && parallelInsertSliceOp.getDest() == bbArg) {
-      storeOps.push_back(parallelInsertSliceOp);
+  for (Operation *user : bbArg.getUsers()) {
+    if (auto parallelOp = dyn_cast<InParallelOpInterface>(user)) {
+      storeOps.push_back(parallelOp);
     }
   }
   return storeOps;
@@ -1673,7 +1674,12 @@ struct ForallOpIterArgsFolder : public OpRewritePattern<ForallOp> {
     for (OpResult result : forallOp.getResults()) {
       OpOperand *opOperand = forallOp.getTiedOpOperand(result);
       BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
-      if (result.use_empty() || forallOp.getCombiningOps(blockArg).empty()) {
+      SmallVector<Operation *> combiningOps =
+          forallOp.getCombiningOps(blockArg);
+      if ((result.use_empty() &&
+           llvm::all_of(combiningOps,
+                        [](Operation *op) { return op->use_empty(); })) ||
+          combiningOps.empty()) {
         resultToDelete.insert(result);
       } else {
         resultToReplace.push_back(result);
@@ -1911,8 +1917,9 @@ struct FoldTensorCastOfOutputIntoForallOp
     auto terminator = newForallOp.getTerminator();
     for (auto [yieldingOp, outputBlockArg] : llvm::zip(
              terminator.getYieldingOps(), newForallOp.getRegionIterArgs())) {
-      auto insertSliceOp = cast<tensor::ParallelInsertSliceOp>(yieldingOp);
-      insertSliceOp.getDestMutable().assign(outputBlockArg);
+      auto insertSliceOp = dyn_cast<tensor::ParallelInsertSliceOp>(yieldingOp);
+      if (insertSliceOp)
+        insertSliceOp.getDestMutable().assign(outputBlockArg);
     }
 
     // Cast results back to the original types.
@@ -1971,19 +1978,6 @@ LogicalResult InParallelOp::verify() {
   if (!forallOp)
     return this->emitOpError("expected forall op parent");
 
-  // TODO: InParallelOpInterface.
-  for (Operation &op : getRegion().front().getOperations()) {
-    if (!isa<tensor::ParallelInsertSliceOp>(op)) {
-      return this->emitOpError("expected only ")
-             << tensor::ParallelInsertSliceOp::getOperationName() << " ops";
-    }
-
-    // Verify that inserts are into out block arguments.
-    Value dest = cast<tensor::ParallelInsertSliceOp>(op).getDest();
-    ArrayRef<BlockArgument> regionOutArgs = forallOp.getRegionOutArgs();
-    if (!llvm::is_contained(regionOutArgs, dest))
-      return op.emitOpError("may only insert into an output block argument");
-  }
   return success();
 }
 
@@ -2018,12 +2012,15 @@ OpResult InParallelOp::getParentResult(int64_t idx) {
 }
 
 SmallVector<BlockArgument> InParallelOp::getDests() {
-  return llvm::to_vector<4>(
-      llvm::map_range(getYieldingOps(), [](Operation &op) {
-        // Add new ops here as needed.
-        auto insertSliceOp = cast<tensor::ParallelInsertSliceOp>(&op);
-        return llvm::cast<BlockArgument>(insertSliceOp.getDest());
-      }));
+  SmallVector<BlockArgument> updatedDests;
+  for (auto &yieldingOp : getYieldingOps()) {
+    auto inParallelOp = dyn_cast<InParallelOpInterface>(&yieldingOp);
+    if (!inParallelOp)
+      continue;
+    for (auto &updatedOperand : inParallelOp.getUpdatedDestinations())
+      updatedDests.push_back(cast<BlockArgument>(updatedOperand.get()));
+  }
+  return updatedDests;
 }
 
 llvm::iterator_range<Block::iterator> InParallelOp::getYieldingOps() {
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
index a44612410bdee..d70392131df51 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
@@ -40,8 +40,8 @@ namespace {
 ///   <implicit in_parallel terminator here>
 /// }
 /// ```
-struct InParallelOpInterface
-    : public BufferDeallocationOpInterface::ExternalModel<InParallelOpInterface,
+struct InParallelDeallocOpInterface
+    : public BufferDeallocationOpInterface::ExternalModel<InParallelDeallocOpInterface,
                                                           scf::InParallelOp> {
   FailureOr<Operation *> process(Operation *op, DeallocationState &state,
                                  const DeallocationOptions &options) const {
@@ -75,7 +75,7 @@ struct ReduceReturnOpInterface
 void mlir::scf::registerBufferDeallocationOpInterfaceExternalModels(
     DialectRegistry &registry) {
   registry.addExtension(+[](MLIRContext *ctx, SCFDialect *dialect) {
-    InParallelOp::attachInterface<InParallelOpInterface>(*ctx);
+    InParallelOp::attachInterface<InParallelDeallocOpInterface>(*ctx);
     ReduceReturnOp::attachInterface<ReduceReturnOpInterface>(*ctx);
   });
 }
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 68584ec4fd814..3770690c21a03 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2978,7 +2978,7 @@ class InsertSliceOpConstantArgumentFolder final
       // The only difference between InsertSliceOp and ParallelInsertSliceOp
       // is that the insertion point is just before the ParallelCombiningOp in
       // the parallel case.
-      if (std::is_same<InsertOpTy, ParallelInsertSliceOp>::value)
+      if (isa<ParallelCombiningOpInterface>(insertSliceOp->getParentOp()))
         rewriter.setInsertionPoint(insertSliceOp->getParentOp());
       toInsert = tensor::CastOp::create(rewriter, insertSliceOp.getLoc(),
                                         sourceType, toInsert);
@@ -3155,7 +3155,7 @@ struct InsertSliceOpSourceCastInserter final
     // The only difference between InsertSliceOp and ParallelInsertSliceOp is
     // that the insertion point is just before the ParallelCombiningOp in the
     // parallel case.
-    if (std::is_same<InsertOpTy, ParallelInsertSliceOp>::value)
+    if (isa<ParallelCombiningOpInterface>(insertSliceOp->getParentOp()))
       rewriter.setInsertionPoint(insertSliceOp->getParentOp());
     Value cast = tensor::CastOp::create(rewriter, insertSliceOp.getLoc(),
                                         newSrcType, insertSliceOp.getSource());
@@ -3901,10 +3901,6 @@ void ParallelInsertSliceOp::build(OpBuilder &b, OperationState &result,
 }
 
 LogicalResult ParallelInsertSliceOp::verify() {
-  if (!isa<ParallelCombiningOpInterface>(getOperation()->getParentOp()))
-    return this->emitError("expected ParallelCombiningOpInterface parent, got:")
-           << *(getOperation()->getParentOp());
-
   // Verify result type against inferred type.
   RankedTensorType expectedType;
   SliceVerificationResult result =
@@ -3935,6 +3931,20 @@ llvm::SmallBitVector ParallelInsertSliceOp::getDroppedDims() {
   return ::getDroppedDims(getSourceType().getShape(), getMixedSizes());
 }
 
+// InParallelOpInterface implementation
+MutableOperandRange ParallelInsertSliceOp::getUpdatedDestinations() {
+  return getDestMutable();
+}
+
+Operation *ParallelInsertSliceOp::getIteratingParent() {
+  // Return the parent ParallelCombiningOpInterface's parent
+  if (auto combiningOp = dyn_cast<ParallelCombiningOpInterface>(
+          getOperation()->getParentOp())) {
+    return combiningOp->getParentOp();
+  }
+  return nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // ScatterOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index c3356c1e4b9d8..def56687477db 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -974,7 +974,9 @@ struct ParallelInsertSliceOpInterface
         parallelInsertSliceOp.getParallelCombiningParent();
 
     // Bufferize the op outside of the parallel combining terminator.
-    rewriter.setInsertionPoint(parallelCombiningParent);
+    if (parallelCombiningParent) {
+      rewriter.setInsertionPoint(parallelCombiningParent);
+    }
 
     // Get source and destination buffers.
     FailureOr<Value> destBuffer =
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
index d76c02af7ab16..0c0380a370d56 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -219,8 +219,10 @@ struct InsertSliceOfInsertSliceFolder : public OpRewritePattern<OpTy> {
     // point outside: only tensor.parallel_insert_slice ops are allowed in
     // there.
     if (std::is_same_v<OpTy, tensor::ParallelInsertSliceOp>) {
-      rewriter.setInsertionPoint(
-          insertSliceOp->template getParentOfType<scf::InParallelOp>());
+      if (auto combiningParent = dyn_cast<ParallelCombiningOpInterface>(
+              insertSliceOp->getParentOp())) {
+        rewriter.setInsertionPoint(insertSliceOp->getParentOp());
+      }  
     }
 
     // Resolve offsets according to source offsets and strides.
diff --git a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
index 2b6703543bbd3..30fcbf0ab3be6 100644
--- a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
+++ b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
@@ -10,18 +10,47 @@
 
 using namespace mlir;
 
+/// Include the definitions of the interface.
+#include "mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc"
+
 //===----------------------------------------------------------------------===//
-// ParallelCombiningOpInterface
+// InParallelOpInterface
 //===----------------------------------------------------------------------===//
 
+// TODO: Catch-22 with interface methods used to verify means methods can't
+// assume the impl is valid.
+LogicalResult mlir::detail::verifyInParallelOpInterface(Operation *op) {
+  auto inParallel = cast<InParallelOpInterface>(op);
+  auto parent = inParallel.getIteratingParent();
+  if (!parent) {
+    return op->emitError(
+        "in_parallel interface op must have an iterating parent");
+  }
+
+  // Simple verification without requiring ParallelIterationOpInterface
+  // Just check that updated destinations are block arguments
+  for (OpOperand &updatedValue : inParallel.getUpdatedDestinations()) {
+    auto bbArg = dyn_cast<BlockArgument>(updatedValue.get());
+    if (!bbArg) {
+      return op->emitError("updating a non block argument");
+    }
+  }
+  return success();
+}
+
+
+//===----------------------------------------------------------------------===//
+// ParallelCombiningOpInterface
+//===----------------------------------------------------------------------===//
 // TODO: Single region single block interface on interfaces ?
 LogicalResult mlir::detail::verifyParallelCombiningOpInterface(Operation *op) {
   if (op->getNumRegions() != 1)
     return op->emitError("expected single region op");
   if (!op->getRegion(0).hasOneBlock())
     return op->emitError("expected single block op region");
+  for (Operation &child : *op->getRegion(0).getBlocks().begin()) {
+    if (!isa<InParallelOpInterface>(&child))
+      return op->emitError("expected only in_parallel interface ops");
+  }
   return success();
 }
-
-/// Include the definitions of the interface.
-#include "mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc"
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index 5f42938244db6..d498f30289fa4 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -915,7 +915,7 @@ func.func @sparse_case(%arg0: tensor<8x8xf32, #CSR>, %arg1: tensor<8xf32>) -> te
 
 // -----
 
-func.func @reduce_dispatch_0() -> tensor<4x2xf32> {
+func.func @parallel_insert_slice() -> tensor<4x2xf32> {
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
   %cst = arith.constant 0.000000e+00 : f32
@@ -923,6 +923,7 @@ func.func @reduce_dispatch_0() -> tensor<4x2xf32> {
   %res = scf.forall (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) {
     %1 = tensor.empty() : tensor<1x1xf32>
     %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32>
+    // CHECK: scf.forall.in_parallel
     scf.forall.in_parallel {
       //      CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}}
       // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor<f32> into tensor<4x2xf32>
@@ -935,6 +936,29 @@ func.func @reduce_dispatch_0() -> tensor<4x2xf32> {
 
 // -----
 
+// CHECK-LABEL: func @parallel_insert_slice_no_terminator
+func.func @parallel_insert_slice_no_terminator() -> tensor<4x2xf32> {
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<4x2xf32>
+  // CHECK: scf.forall
+  %res = scf.forall (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) {
+    %1 = tensor.empty() : tensor<1x1xf32>
+    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32>
+    //      CHECK: scf.forall.in_parallel
+    //      CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}}
+    // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor<f32> into tensor<4x2xf32>
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %2 into %o[%arg0, %arg1] [1, 1] [1, 1] :
+        tensor<1x1xf32> into tensor<4x2xf32>
+    }
+  }
+  return %res: tensor<4x2xf32>
+}
+
+// -----
+
 #map0 = affine_map<(i, j) -> (i, j)>
 #access = [#map0, #map0]
 #trait = {
diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index bb7958083e55c..d8455b47f6b1d 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -628,11 +628,9 @@ func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 
   %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.forall.in_parallel {
-        // expected-error @+1 {{may only insert into an output block argument}}
-        tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
-          tensor<1xf32> into tensor<100xf32>
-      }
+      // expected-error @+1 {{in_parallel interface op must have an iterating parent}}
+      tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
+        tensor<1xf32> into tensor<100xf32>
   }
   return
 }
@@ -645,7 +643,7 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 
   %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
+      // expected-error @+1 {{expected only in_parallel interface ops}}
       scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
index 9bb87ffbb2090..ed3685514dd0d 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -908,3 +908,111 @@ func.func @parallel_region_no_read()
   }
   return
 }
+
+// -----
+
+// CHECK-LABEL: func @in_order_multiple_parallel_writes
+func.func @in_order_multiple_parallel_writes(%2: tensor<320xf32> {bufferization.writable = true},
+                                            %3: tensor<320xf32> {bufferization.writable = true})
+  -> (tensor<320xf32>, tensor<320xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %c320 = arith.constant 320 : index
+  %4:2 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2, %arg2 = %3) -> (tensor<320xf32>, tensor<320xf32>) {
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %7 = tensor.extract_slice %arg2[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+    %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+
+    // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+    // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %6 into %arg2[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+      tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+    }
+  }
+  return %4#0, %4#1 : tensor<320xf32>, tensor<320xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @out_of_order_parallel_write
+func.func @out_of_order_parallel_write(%2: tensor<320xf32> {bufferization.writable = true},
+                                       %3: tensor<320xf32> {bufferization.writable = true})
+  -> (tensor<320xf32>, tensor<320xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %c320 = arith.constant 320 : index
+  %4:2 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2, %arg2 = %3) -> (tensor<320xf32>, tensor<320xf32>) {
+    // The extract_slice cannot operate in place because it is used after the
+    // first write.
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+
+    // Additionally the fill aliases the thread local slice.
+    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) -> tensor<1xf32>
+
+    scf.forall.in_parallel {
+      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+      tensor.parallel_insert_slice %7 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+      tensor.parallel_insert_slice %6 into %arg2[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+    }
+  }
+  return %4#0, %4#1 : tensor<320xf32>, tensor<320xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @out_of_order_parallel_write_multiple_reads
+func.func @out_of_order_parallel_write_multiple_reads(%2: tensor<320xf32> {bufferization.writable = true},
+                                                      %3: tensor<320xf32> {bufferization.writable = true})
+  -> (tensor<320xf32>, tensor<320xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %c320 = arith.constant 320 : index
+  %4:2 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2, %arg2 = %3) -> (tensor<320xf32>, tensor<320xf32>) {
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["false", "none"]}
+    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
+    %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<1xf32>) -> tensor<1xf32>
+
+    %reverse = arith.subi %c320, %arg0 : index
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %8 = tensor.extract_slice %arg1[%reverse] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    scf.forall.in_parallel {
+      // Also cannot operate in place due to subsequent conflicting reads.
+      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+      tensor.parallel_insert_slice %7 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+      // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+      tensor.parallel_insert_slice %8 into %arg2[%reverse] [1] [1] : tensor<1xf32> into tensor<320xf32>
+    }
+  }
+  return %4#0, %4#1 : tensor<320xf32>, tensor<320xf32>
+}
+// -----
+
+// CHECK-LABEL: func @in_order_multiple_parallel_writes
+func.func @in_order_multiple_parallel_writes(%2: tensor<320xf32> {bufferization.writable = true})
+  -> (tensor<320xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %c320 = arith.constant 320 : index
+  %4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    %reverse = arith.subi %c320, %arg0 : index
+    // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %6 into %arg1[%reverse] [1] [1] : tensor<1xf32> into tensor<320xf32>
+    }
+  }
+  return %4 : tensor<320xf32>
+} 
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
index 8f4b924cfd3cc..92486b8ed7208 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
@@ -112,7 +112,7 @@ func.func @scf_while_non_equiv_condition_and_body(%A: tensor<5xi1>,
 //  CHECK-SAME:     %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32>
 // CHECK-FUNC-LABEL: func @scf_forall_out_of_place(
 func.func @scf_forall_out_of_place(%in: tensor<100xf32>,
-                                           %out: tensor<100xf32>) {
+                                   %out: tensor<100xf32>) {
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
@@ -132,3 +132,31 @@ func.func @scf_forall_out_of_place(%in: tensor<100xf32>,
   } {mapping = [#gpu.thread<x>]}
   return
 }
+
+// -----
+
+// CHECK-LABEL: func @in_order_multiple_parallel_writes
+func.func @in_order_multiple_parallel_writes(%2: tensor<320xf32>,
+                                             %3: tensor<320xf32>)
+  -> (tensor<320xf32>, tensor<320xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant -0.000000e+00 : f32
+  %c320 = arith.constant 320 : index
+  %4:2 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2, %arg2 = %3) -> (tensor<320xf32>, tensor<320xf32>) {
+    // CHECK: tensor.extract_slice {{.*}}
+    %6 = tensor.extract_slice %arg1[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    // CHECK: tensor.extract_slice {{.*}}
+    %7 = tensor.extract_slice %arg2[%arg0] [1] [1] : tensor<320xf32> to tensor<1xf32>
+    // CHECK: linalg.fill {{.*}}
+    %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
+
+    // CHECK: tensor.parallel_insert_slice {{.*}}
+    // CHECK: tensor.parallel_insert_slice {{.*}}
+    scf.forall.in_parallel {
+      tensor.parallel_insert_slice %6 into %arg2[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+      tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
+    }
+  }
+  return %4#0, %4#1 : tensor<320xf32>, tensor<320xf32>
+}

>From 7634face73e4d9db74da12d5811f2cb523cf514d Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 10:44:54 -0400
Subject: [PATCH 02/11] Update according to comments

---
 mlir/include/mlir/Dialect/SCF/IR/SCFOps.td    |  2 -
 .../Interfaces/ParallelCombiningOpInterface.h |  1 -
 .../ParallelCombiningOpInterface.td           | 33 ++++++++++++++---
 .../TransformOps/LinalgTransformOps.cpp       |  7 +---
 .../Linalg/Transforms/DropUnitDims.cpp        |  8 +---
 mlir/lib/Dialect/SCF/IR/SCF.cpp               | 30 ++++++++++-----
 .../BufferDeallocationOpInterfaceImpl.cpp     |  4 +-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp      |  4 ++
 .../BufferizableOpInterfaceImpl.cpp           |  4 +-
 .../Tensor/Transforms/FoldTensorSubsetOps.cpp |  9 ++---
 .../ParallelCombiningOpInterface.cpp          | 37 ++-----------------
 .../Dialect/Linalg/drop-unit-extent-dims.mlir | 23 ------------
 mlir/test/Dialect/SCF/invalid.mlir            | 10 +++--
 13 files changed, 73 insertions(+), 99 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 88df54174da24..a5ac23dc07c28 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -679,8 +679,6 @@ def InParallelOp : SCF_Op<"forall.in_parallel", [
     OpBuilder<(ins)>,
   ];
 
-  // TODO: Add a `InParallelOpInterface` interface for ops that can
-  // appear inside in_parallel.
   let extraClassDeclaration = [{
     ::llvm::SmallVector<::mlir::BlockArgument> getDests();
     ::llvm::iterator_range<::mlir::Block::iterator> getYieldingOps();
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
index e3441b8322d96..72db06163df37 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
@@ -20,7 +20,6 @@ namespace mlir {
 namespace detail {
 // TODO: Single region single block interface on interfaces ?
 LogicalResult verifyParallelCombiningOpInterface(Operation *op);
-LogicalResult verifyInParallelOpInterface(Operation *op);
 } // namespace detail
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
index 86eaf2c95462c..f677d7c4f650b 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
@@ -58,8 +58,34 @@ def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
 
 def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
   let description = [{
-    An in_parallel op is an operation that inserts into a shared tensor in
-    conjunction with a parent combining and iterating op.
+    An `in_parallel` op is an operation performs parallel updates to
+    destination tensors within the context of a parent iterating operation.
+    
+    This interface is designed for operations that need to coordinate parallel
+    insertions or updates to tensors that are being constructed or modified
+    across multiple parallel iterations. The "updated destination" refers to a
+    destination tensor that accumulates results from parallel computations,
+    where each parallel iteration may contribute a slice, element, or region
+    to the final result.
+
+    One significant use case for this interface is `tensor.parallel_insert_slice`
+    which allows parallel insertion of slices into a destination tensor. But with
+    this interface, other operations that perform similar parallel updates can
+    also be defined.
+
+    The in_parallel operation works within a combining operation (implementing
+    `ParallelCombiningOpInterface`) that specifies how the parallel results are combined.
+
+    Key semantics:
+    - The operation identifies destination tensors that will be updated
+      through the `getUpdatedDestinations` method
+    - Each parallel iteration may update elements or regions of the
+      destination tensor
+    - The parent iterating operation manages the coordination and ensures
+      proper synchronization of these updates
+  
+    Note: This interface does not verify itself; it is up to the implementing operation
+    to verify the correctness of the op.
   }];
   let cppNamespace = "::mlir";
 
@@ -80,9 +106,6 @@ def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
       /*args=*/(ins)
     >,
   ];
-  let verify = [{
-    return ::mlir::detail::verifyInParallelOpInterface($_op);
-  }];
 }
 
 #endif // MLIR_INTERFACES_PARALLELCOMBININGOPINTERFACE
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 840737fdb836b..0cd8c29dca06d 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -4141,14 +4141,11 @@ DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
     return DiagnosedSilenceableFailure::success();
   }
 
-  // If we are inside an ParallelCombiningOp region, temporarily set the
+  // If we are inside a `ParallelCombiningOp` region, temporarily set the
   // insertion point outside: only ops implementing InParallelOpInterface are
   // allowed in there.
   if (isa<mlir::InParallelOpInterface>(target.getOperation())) {
-    if (auto combiningParent =
-            dyn_cast<ParallelCombiningOpInterface>(target->getParentOp())) {
-      rewriter.setInsertionPoint(target->getParentOp());
-    }
+    rewriter.setInsertionPoint(target->getParentOp());
   }
 
   Value extracted = tensor::ExtractSliceOp::create(
diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
index 9eea88fb5a837..22690daa4f9e1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp
@@ -784,12 +784,8 @@ struct RankReducedInsertSliceOp : public OpRewritePattern<InsertOpTy> {
       // The only difference between InsertSliceOp and ParallelInsertSliceOp
       // is the insertion point is just before the ParallelCombiningOp in the
       // parallel case.
-      if (std::is_same<InsertOpTy, tensor::ParallelInsertSliceOp>::value) {
-        if (auto combiningParent = dyn_cast<ParallelCombiningOpInterface>(
-                insertSliceOp->getParentOp())) {
-          rewriter.setInsertionPoint(insertSliceOp->getParentOp());
-        }
-      }
+      if (std::is_same<InsertOpTy, tensor::ParallelInsertSliceOp>::value)
+        rewriter.setInsertionPoint(insertSliceOp->getParentOp());
       reshapedSource = tensor::CollapseShapeOp::create(
           rewriter, loc, insertSliceOp.getSource(), *reassociation);
     }
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 873dbbde48b37..d11f9c6879e6d 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -21,6 +21,7 @@
 #include "mlir/IR/Matchers.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Interfaces/ParallelCombiningOpInterface.h"
 #include "mlir/Interfaces/ValueBoundsOpInterface.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/MapVector.h"
@@ -1440,7 +1441,6 @@ InParallelOp ForallOp::getTerminator() {
   return cast<InParallelOp>(getBody()->getTerminator());
 }
 
-
 SmallVector<Operation *> ForallOp::getCombiningOps(BlockArgument bbArg) {
   SmallVector<Operation *> storeOps;
   for (Operation *user : bbArg.getUsers()) {
@@ -1674,12 +1674,9 @@ struct ForallOpIterArgsFolder : public OpRewritePattern<ForallOp> {
     for (OpResult result : forallOp.getResults()) {
       OpOperand *opOperand = forallOp.getTiedOpOperand(result);
       BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
-      SmallVector<Operation *> combiningOps =
-          forallOp.getCombiningOps(blockArg);
       if ((result.use_empty() &&
-           llvm::all_of(combiningOps,
-                        [](Operation *op) { return op->use_empty(); })) ||
-          combiningOps.empty()) {
+           llvm::all_of(forallOp.getCombiningOps(blockArg),
+                        [](Operation *op) { return op->use_empty(); }))) {
         resultToDelete.insert(result);
       } else {
         resultToReplace.push_back(result);
@@ -1917,9 +1914,9 @@ struct FoldTensorCastOfOutputIntoForallOp
     auto terminator = newForallOp.getTerminator();
     for (auto [yieldingOp, outputBlockArg] : llvm::zip(
              terminator.getYieldingOps(), newForallOp.getRegionIterArgs())) {
-      auto insertSliceOp = dyn_cast<tensor::ParallelInsertSliceOp>(yieldingOp);
-      if (insertSliceOp)
-        insertSliceOp.getDestMutable().assign(outputBlockArg);
+      auto inParallelOp = dyn_cast<InParallelOpInterface>(yieldingOp);
+      if (inParallelOp)
+        inParallelOp.getUpdatedDestinations().assign(outputBlockArg);
     }
 
     // Cast results back to the original types.
@@ -1978,6 +1975,21 @@ LogicalResult InParallelOp::verify() {
   if (!forallOp)
     return this->emitOpError("expected forall op parent");
 
+  for (Operation &op : getRegion().front().getOperations()) {
+    auto inParallelOp = dyn_cast<InParallelOpInterface>(&op);
+    if (!inParallelOp) {
+      return this->emitOpError("expected only InParallelOpInterface") << " ops";
+    }
+
+    // Verify that inserts are into out block arguments.
+    MutableOperandRange dests = inParallelOp.getUpdatedDestinations();
+    ArrayRef<BlockArgument> regionOutArgs = forallOp.getRegionOutArgs();
+    for (OpOperand &dest : dests) {
+      if (!llvm::is_contained(regionOutArgs, dest.get()))
+        return op.emitOpError("may only insert into an output block argument");
+    }
+  }
+
   return success();
 }
 
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
index d70392131df51..09b168e574d42 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
@@ -41,8 +41,8 @@ namespace {
 /// }
 /// ```
 struct InParallelDeallocOpInterface
-    : public BufferDeallocationOpInterface::ExternalModel<InParallelDeallocOpInterface,
-                                                          scf::InParallelOp> {
+    : public BufferDeallocationOpInterface::ExternalModel<
+          InParallelDeallocOpInterface, scf::InParallelOp> {
   FailureOr<Operation *> process(Operation *op, DeallocationState &state,
                                  const DeallocationOptions &options) const {
     auto inParallelOp = cast<scf::InParallelOp>(op);
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 3770690c21a03..ce16dd0a8847e 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3901,6 +3901,10 @@ void ParallelInsertSliceOp::build(OpBuilder &b, OperationState &result,
 }
 
 LogicalResult ParallelInsertSliceOp::verify() {
+  if (!isa<ParallelCombiningOpInterface>(getOperation()->getParentOp()))
+    return this->emitError("expected ParallelCombiningOpInterface parent, got:")
+           << *(getOperation()->getParentOp());
+
   // Verify result type against inferred type.
   RankedTensorType expectedType;
   SliceVerificationResult result =
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index def56687477db..c3356c1e4b9d8 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -974,9 +974,7 @@ struct ParallelInsertSliceOpInterface
         parallelInsertSliceOp.getParallelCombiningParent();
 
     // Bufferize the op outside of the parallel combining terminator.
-    if (parallelCombiningParent) {
-      rewriter.setInsertionPoint(parallelCombiningParent);
-    }
+    rewriter.setInsertionPoint(parallelCombiningParent);
 
     // Get source and destination buffers.
     FailureOr<Value> destBuffer =
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
index 0c0380a370d56..be25276207246 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -216,13 +216,10 @@ struct InsertSliceOfInsertSliceFolder : public OpRewritePattern<OpTy> {
                                         droppedDims, resolvedSizes);
 
     // If we are inside an InParallel region, temporarily set the insertion
-    // point outside: only tensor.parallel_insert_slice ops are allowed in
+    // point outside: only ops of InParallelOpInterface are allowed in
     // there.
-    if (std::is_same_v<OpTy, tensor::ParallelInsertSliceOp>) {
-      if (auto combiningParent = dyn_cast<ParallelCombiningOpInterface>(
-              insertSliceOp->getParentOp())) {
-        rewriter.setInsertionPoint(insertSliceOp->getParentOp());
-      }  
+    if (isa<mlir::InParallelOpInterface>(insertSliceOp.getOperation())) {
+      rewriter.setInsertionPoint(insertSliceOp->getParentOp());
     }
 
     // Resolve offsets according to source offsets and strides.
diff --git a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
index 30fcbf0ab3be6..2b6703543bbd3 100644
--- a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
+++ b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
@@ -10,47 +10,18 @@
 
 using namespace mlir;
 
-/// Include the definitions of the interface.
-#include "mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc"
-
-//===----------------------------------------------------------------------===//
-// InParallelOpInterface
-//===----------------------------------------------------------------------===//
-
-// TODO: Catch-22 with interface methods used to verify means methods can't
-// assume the impl is valid.
-LogicalResult mlir::detail::verifyInParallelOpInterface(Operation *op) {
-  auto inParallel = cast<InParallelOpInterface>(op);
-  auto parent = inParallel.getIteratingParent();
-  if (!parent) {
-    return op->emitError(
-        "in_parallel interface op must have an iterating parent");
-  }
-
-  // Simple verification without requiring ParallelIterationOpInterface
-  // Just check that updated destinations are block arguments
-  for (OpOperand &updatedValue : inParallel.getUpdatedDestinations()) {
-    auto bbArg = dyn_cast<BlockArgument>(updatedValue.get());
-    if (!bbArg) {
-      return op->emitError("updating a non block argument");
-    }
-  }
-  return success();
-}
-
-
 //===----------------------------------------------------------------------===//
 // ParallelCombiningOpInterface
 //===----------------------------------------------------------------------===//
+
 // TODO: Single region single block interface on interfaces ?
 LogicalResult mlir::detail::verifyParallelCombiningOpInterface(Operation *op) {
   if (op->getNumRegions() != 1)
     return op->emitError("expected single region op");
   if (!op->getRegion(0).hasOneBlock())
     return op->emitError("expected single block op region");
-  for (Operation &child : *op->getRegion(0).getBlocks().begin()) {
-    if (!isa<InParallelOpInterface>(&child))
-      return op->emitError("expected only in_parallel interface ops");
-  }
   return success();
 }
+
+/// Include the definitions of the interface.
+#include "mlir/Interfaces/ParallelCombiningOpInterface.cpp.inc"
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index d498f30289fa4..9005110205630 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -936,29 +936,6 @@ func.func @parallel_insert_slice() -> tensor<4x2xf32> {
 
 // -----
 
-// CHECK-LABEL: func @parallel_insert_slice_no_terminator
-func.func @parallel_insert_slice_no_terminator() -> tensor<4x2xf32> {
-  %c2 = arith.constant 2 : index
-  %c4 = arith.constant 4 : index
-  %cst = arith.constant 0.000000e+00 : f32
-  %0 = tensor.empty() : tensor<4x2xf32>
-  // CHECK: scf.forall
-  %res = scf.forall (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) {
-    %1 = tensor.empty() : tensor<1x1xf32>
-    %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    //      CHECK: scf.forall.in_parallel
-    //      CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}}
-    // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor<f32> into tensor<4x2xf32>
-    scf.forall.in_parallel {
-      tensor.parallel_insert_slice %2 into %o[%arg0, %arg1] [1, 1] [1, 1] :
-        tensor<1x1xf32> into tensor<4x2xf32>
-    }
-  }
-  return %res: tensor<4x2xf32>
-}
-
-// -----
-
 #map0 = affine_map<(i, j) -> (i, j)>
 #access = [#map0, #map0]
 #trait = {
diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index d8455b47f6b1d..bb7958083e55c 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -628,9 +628,11 @@ func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 
   %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      // expected-error @+1 {{in_parallel interface op must have an iterating parent}}
-      tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
-        tensor<1xf32> into tensor<100xf32>
+      scf.forall.in_parallel {
+        // expected-error @+1 {{may only insert into an output block argument}}
+        tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
+          tensor<1xf32> into tensor<100xf32>
+      }
   }
   return
 }
@@ -643,7 +645,7 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 
   %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      // expected-error @+1 {{expected only in_parallel interface ops}}
+      // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
       scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>

>From a9ce4c36dfbc66ed75685ee0231f7b4f3bb5bca0 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 11:54:43 -0400
Subject: [PATCH 03/11] Swap the name of the interface

---
 mlir/include/mlir/Dialect/SCF/IR/SCFOps.td       |  2 +-
 mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td | 16 ++++++++--------
 .../Interfaces/ParallelCombiningOpInterface.h    |  2 +-
 .../Interfaces/ParallelCombiningOpInterface.td   |  6 +++---
 .../Linalg/TransformOps/LinalgTransformOps.cpp   |  6 +++---
 mlir/lib/Dialect/SCF/IR/SCF.cpp                  | 10 +++++-----
 .../BufferDeallocationOpInterfaceImpl.cpp        |  2 +-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp         | 14 +++++++-------
 .../Transforms/BufferizableOpInterfaceImpl.cpp   |  2 +-
 .../Tensor/Transforms/FoldTensorSubsetOps.cpp    |  4 ++--
 .../Interfaces/ParallelCombiningOpInterface.cpp  |  4 ++--
 11 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index a5ac23dc07c28..d3c01c31636a7 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -654,7 +654,7 @@ def ForallOp : SCF_Op<"forall", [
 def InParallelOp : SCF_Op<"forall.in_parallel", [
        Pure,
        Terminator,
-       DeclareOpInterfaceMethods<ParallelCombiningOpInterface>,
+       DeclareOpInterfaceMethods<InParallelOpInterface>,
        HasParent<"ForallOp">,
       ] # GraphRegionNoTerminator.traits> {
   let summary = "terminates a `forall` block";
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 842a76e8fe90f..cd134fdfd094c 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1470,26 +1470,26 @@ def Tensor_PadOp : Tensor_Op<"pad", [
 // ParallelInsertSliceOp
 //===----------------------------------------------------------------------===//
 
-// TODO: Implement InParallelOpInterface.
+// TODO: Implement ParallelCombiningOpInterface.
 def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
        AttrSizedOperandSegments,
        OffsetSizeAndStrideOpInterface,
-       DeclareOpInterfaceMethods<InParallelOpInterface,
+       DeclareOpInterfaceMethods<ParallelCombiningOpInterface,
           ["getUpdatedDestinations", "getIteratingParent"]>,
        // TODO: Cannot use an interface here atm, verify this manually for now.
-       // HasParent<"ParallelCombiningOpInterface">
+       // HasParent<"InParallelOpInterface">
   ]> {
   let summary = [{
     Specify the tensor slice update of a single thread of a parent
-    ParallelCombiningOpInterface op.
+    InParallelOpInterface op.
   }];
   let description = [{
     The `parallel_insert_slice` yields a subset tensor value to its parent
-    ParallelCombiningOpInterface. These subset tensor values are aggregated to
+    InParallelOpInterface. These subset tensor values are aggregated
     in some unspecified order into a full tensor value returned by the parent
     parallel iterating op.
     The `parallel_insert_slice` is one such op allowed in the
-    ParallelCombiningOpInterface op.
+    InParallelOpInterface op.
 
     Conflicting writes result in undefined semantics, in that the indices written
     to by multiple parallel updates might contain data from any of the updates,
@@ -1571,8 +1571,8 @@ def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
       return ::llvm::cast<RankedTensorType>(getDest().getType());
     }
 
-    ParallelCombiningOpInterface getParallelCombiningParent() {
-      return dyn_cast<ParallelCombiningOpInterface>(
+    InParallelOpInterface getParallelCombiningParent() {
+      return dyn_cast<InParallelOpInterface>(
         getOperation()->getParentOp());
     }
 
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
index 72db06163df37..82ab427699f64 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.h
@@ -19,7 +19,7 @@
 namespace mlir {
 namespace detail {
 // TODO: Single region single block interface on interfaces ?
-LogicalResult verifyParallelCombiningOpInterface(Operation *op);
+LogicalResult verifyInParallelOpInterface(Operation *op);
 } // namespace detail
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
index f677d7c4f650b..acd1b1065af32 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
@@ -15,7 +15,7 @@
 
 include "mlir/IR/OpBase.td"
 
-def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
+def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
   let description = [{
     A parallel combining op is an op with a region.
 
@@ -52,11 +52,11 @@ def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   ];
   // TODO: Single region single block interface on interfaces ?
   let verify = [{
-    return verifyParallelCombiningOpInterface($_op);
+    return verifyInParallelOpInterface($_op);
   }];
 }
 
-def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
+def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   let description = [{
     An `in_parallel` op is an operation performs parallel updates to
     destination tensors within the context of a parent iterating operation.
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 0cd8c29dca06d..a2ec1e9eaebde 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -4141,10 +4141,10 @@ DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
     return DiagnosedSilenceableFailure::success();
   }
 
-  // If we are inside a `ParallelCombiningOp` region, temporarily set the
-  // insertion point outside: only ops implementing InParallelOpInterface are
+  // If we are inside a `InParallelOp` region, temporarily set the
+  // insertion point outside: only ops implementing ParallelCombiningOpInterface are
   // allowed in there.
-  if (isa<mlir::InParallelOpInterface>(target.getOperation())) {
+  if (isa<mlir::ParallelCombiningOpInterface>(target.getOperation())) {
     rewriter.setInsertionPoint(target->getParentOp());
   }
 
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index d11f9c6879e6d..8c68488ab59f8 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1444,7 +1444,7 @@ InParallelOp ForallOp::getTerminator() {
 SmallVector<Operation *> ForallOp::getCombiningOps(BlockArgument bbArg) {
   SmallVector<Operation *> storeOps;
   for (Operation *user : bbArg.getUsers()) {
-    if (auto parallelOp = dyn_cast<InParallelOpInterface>(user)) {
+    if (auto parallelOp = dyn_cast<ParallelCombiningOpInterface>(user)) {
       storeOps.push_back(parallelOp);
     }
   }
@@ -1914,7 +1914,7 @@ struct FoldTensorCastOfOutputIntoForallOp
     auto terminator = newForallOp.getTerminator();
     for (auto [yieldingOp, outputBlockArg] : llvm::zip(
              terminator.getYieldingOps(), newForallOp.getRegionIterArgs())) {
-      auto inParallelOp = dyn_cast<InParallelOpInterface>(yieldingOp);
+      auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(yieldingOp);
       if (inParallelOp)
         inParallelOp.getUpdatedDestinations().assign(outputBlockArg);
     }
@@ -1976,9 +1976,9 @@ LogicalResult InParallelOp::verify() {
     return this->emitOpError("expected forall op parent");
 
   for (Operation &op : getRegion().front().getOperations()) {
-    auto inParallelOp = dyn_cast<InParallelOpInterface>(&op);
+    auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&op);
     if (!inParallelOp) {
-      return this->emitOpError("expected only InParallelOpInterface") << " ops";
+      return this->emitOpError("expected only ParallelCombiningOpInterface") << " ops";
     }
 
     // Verify that inserts are into out block arguments.
@@ -2026,7 +2026,7 @@ OpResult InParallelOp::getParentResult(int64_t idx) {
 SmallVector<BlockArgument> InParallelOp::getDests() {
   SmallVector<BlockArgument> updatedDests;
   for (auto &yieldingOp : getYieldingOps()) {
-    auto inParallelOp = dyn_cast<InParallelOpInterface>(&yieldingOp);
+    auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&yieldingOp);
     if (!inParallelOp)
       continue;
     for (auto &updatedOperand : inParallelOp.getUpdatedDestinations())
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
index 09b168e574d42..63216e7cc7fba 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferDeallocationOpInterfaceImpl.cpp
@@ -16,7 +16,7 @@ using namespace mlir::bufferization;
 namespace {
 /// The `scf.forall.in_parallel` terminator is special in a few ways:
 /// * It does not implement the BranchOpInterface or
-///   RegionBranchTerminatorOpInterface, but the ParallelCombiningOpInterface
+///   RegionBranchTerminatorOpInterface, but the InParallelOpInterface
 ///   which is not supported by BufferDeallocation.
 /// * It has a graph-like region which only allows one specific tensor op
 /// * After bufferization the nested region is always empty
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index ce16dd0a8847e..afb660a7ce850 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2978,7 +2978,7 @@ class InsertSliceOpConstantArgumentFolder final
       // The only difference between InsertSliceOp and ParallelInsertSliceOp
       // is that the insertion point is just before the ParallelCombiningOp in
       // the parallel case.
-      if (isa<ParallelCombiningOpInterface>(insertSliceOp->getParentOp()))
+      if (isa<InParallelOpInterface>(insertSliceOp->getParentOp()))
         rewriter.setInsertionPoint(insertSliceOp->getParentOp());
       toInsert = tensor::CastOp::create(rewriter, insertSliceOp.getLoc(),
                                         sourceType, toInsert);
@@ -3846,7 +3846,7 @@ OpFoldResult PadOp::fold(FoldAdaptor) {
 //===----------------------------------------------------------------------===//
 
 OpResult ParallelInsertSliceOp::getTiedOpResult() {
-  ParallelCombiningOpInterface parallelCombiningParent =
+  InParallelOpInterface parallelCombiningParent =
       getParallelCombiningParent();
   for (const auto &it :
        llvm::enumerate(parallelCombiningParent.getYieldingOps())) {
@@ -3901,8 +3901,8 @@ void ParallelInsertSliceOp::build(OpBuilder &b, OperationState &result,
 }
 
 LogicalResult ParallelInsertSliceOp::verify() {
-  if (!isa<ParallelCombiningOpInterface>(getOperation()->getParentOp()))
-    return this->emitError("expected ParallelCombiningOpInterface parent, got:")
+  if (!isa<InParallelOpInterface>(getOperation()->getParentOp()))
+    return this->emitError("expected InParallelOpInterface parent, got:")
            << *(getOperation()->getParentOp());
 
   // Verify result type against inferred type.
@@ -3935,14 +3935,14 @@ llvm::SmallBitVector ParallelInsertSliceOp::getDroppedDims() {
   return ::getDroppedDims(getSourceType().getShape(), getMixedSizes());
 }
 
-// InParallelOpInterface implementation
+// ParallelCombiningOpInterface implementation
 MutableOperandRange ParallelInsertSliceOp::getUpdatedDestinations() {
   return getDestMutable();
 }
 
 Operation *ParallelInsertSliceOp::getIteratingParent() {
-  // Return the parent ParallelCombiningOpInterface's parent
-  if (auto combiningOp = dyn_cast<ParallelCombiningOpInterface>(
+  // Return the parent InParallelOpInterface's parent
+  if (auto combiningOp = dyn_cast<InParallelOpInterface>(
           getOperation()->getParentOp())) {
     return combiningOp->getParentOp();
   }
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index c3356c1e4b9d8..386193ec79a52 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -970,7 +970,7 @@ struct ParallelInsertSliceOpInterface
                           BufferizationState &state) const {
     OpBuilder::InsertionGuard g(rewriter);
     auto parallelInsertSliceOp = cast<ParallelInsertSliceOp>(op);
-    ParallelCombiningOpInterface parallelCombiningParent =
+    InParallelOpInterface parallelCombiningParent =
         parallelInsertSliceOp.getParallelCombiningParent();
 
     // Bufferize the op outside of the parallel combining terminator.
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
index be25276207246..3284ed9c6fa06 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -216,9 +216,9 @@ struct InsertSliceOfInsertSliceFolder : public OpRewritePattern<OpTy> {
                                         droppedDims, resolvedSizes);
 
     // If we are inside an InParallel region, temporarily set the insertion
-    // point outside: only ops of InParallelOpInterface are allowed in
+    // point outside: only ops of ParallelCombiningOpInterface are allowed in
     // there.
-    if (isa<mlir::InParallelOpInterface>(insertSliceOp.getOperation())) {
+    if (isa<mlir::ParallelCombiningOpInterface>(insertSliceOp.getOperation())) {
       rewriter.setInsertionPoint(insertSliceOp->getParentOp());
     }
 
diff --git a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
index 2b6703543bbd3..30b8191bf34b0 100644
--- a/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
+++ b/mlir/lib/Interfaces/ParallelCombiningOpInterface.cpp
@@ -11,11 +11,11 @@
 using namespace mlir;
 
 //===----------------------------------------------------------------------===//
-// ParallelCombiningOpInterface
+// InParallelOpInterface (formerly ParallelCombiningOpInterface)
 //===----------------------------------------------------------------------===//
 
 // TODO: Single region single block interface on interfaces ?
-LogicalResult mlir::detail::verifyParallelCombiningOpInterface(Operation *op) {
+LogicalResult mlir::detail::verifyInParallelOpInterface(Operation *op) {
   if (op->getNumRegions() != 1)
     return op->emitError("expected single region op");
   if (!op->getRegion(0).hasOneBlock())

>From 0958eaf308f32b4404463b54b3b038a7078fe1e3 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 12:20:30 -0400
Subject: [PATCH 04/11] Update comments

---
 .../mlir/Interfaces/ParallelCombiningOpInterface.td   | 11 ++++++-----
 .../Linalg/TransformOps/LinalgTransformOps.cpp        |  2 +-
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp              |  4 ++--
 .../Tensor/Transforms/BufferizableOpInterfaceImpl.cpp |  2 +-
 .../Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp |  2 +-
 5 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
index acd1b1065af32..c83fd250d6ef6 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Defines the interface for ops that perform parallel combining operations.
+// Defines the interface for ops that perform in parallel combining
+// operations.
 //
 //===----------------------------------------------------------------------===//
 
@@ -17,7 +18,7 @@ include "mlir/IR/OpBase.td"
 
 def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
   let description = [{
-    A parallel combining op is an op with a region.
+    An in parallel op is an op with a region.
 
     This is useful as a terminator to parallel operations that iterate over
     some set and return tensors while avoiding tight coupling between the
@@ -58,7 +59,7 @@ def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
 
 def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   let description = [{
-    An `in_parallel` op is an operation performs parallel updates to
+    A parallel combining op is an operation performs parallel updates to
     destination tensors within the context of a parent iterating operation.
     
     This interface is designed for operations that need to coordinate parallel
@@ -73,8 +74,8 @@ def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
     this interface, other operations that perform similar parallel updates can
     also be defined.
 
-    The in_parallel operation works within a combining operation (implementing
-    `ParallelCombiningOpInterface`) that specifies how the parallel results are combined.
+    This op works within an op  implementing the
+    `InParallelOpInterface` that specifies how the parallel results are combined.
 
     Key semantics:
     - The operation identifies destination tensors that will be updated
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index a2ec1e9eaebde..9cc701c33a686 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -4141,7 +4141,7 @@ DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
     return DiagnosedSilenceableFailure::success();
   }
 
-  // If we are inside a `InParallelOp` region, temporarily set the
+  // If we are inside a `ParallelCombiningOp` region, temporarily set the
   // insertion point outside: only ops implementing ParallelCombiningOpInterface are
   // allowed in there.
   if (isa<mlir::ParallelCombiningOpInterface>(target.getOperation())) {
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index afb660a7ce850..68a229b9058fc 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -2976,7 +2976,7 @@ class InsertSliceOpConstantArgumentFolder final
     if (sourceType != insertSliceOp.getSourceType()) {
       OpBuilder::InsertionGuard g(rewriter);
       // The only difference between InsertSliceOp and ParallelInsertSliceOp
-      // is that the insertion point is just before the ParallelCombiningOp in
+      // is that the insertion point is just before the InParallelOp in
       // the parallel case.
       if (isa<InParallelOpInterface>(insertSliceOp->getParentOp()))
         rewriter.setInsertionPoint(insertSliceOp->getParentOp());
@@ -3153,7 +3153,7 @@ struct InsertSliceOpSourceCastInserter final
     // Insert the cast.
     OpBuilder::InsertionGuard g(rewriter);
     // The only difference between InsertSliceOp and ParallelInsertSliceOp is
-    // that the insertion point is just before the ParallelCombiningOp in the
+    // that the insertion point is just before the InParallelOp in the
     // parallel case.
-    if (isa<ParallelCombiningOpInterface>(insertSliceOp->getParentOp()))
+    if (isa<InParallelOpInterface>(insertSliceOp->getParentOp()))
       rewriter.setInsertionPoint(insertSliceOp->getParentOp());
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index 386193ec79a52..bce964e47a3be 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -973,7 +973,7 @@ struct ParallelInsertSliceOpInterface
     InParallelOpInterface parallelCombiningParent =
         parallelInsertSliceOp.getParallelCombiningParent();
 
-    // Bufferize the op outside of the parallel combining terminator.
+    // Bufferize the op outside of the in parallel terminator.
     rewriter.setInsertionPoint(parallelCombiningParent);
 
     // Get source and destination buffers.
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
index 3284ed9c6fa06..43d288342bcce 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -215,7 +215,7 @@ struct InsertSliceOfInsertSliceFolder : public OpRewritePattern<OpTy> {
                                         sourceInsertSliceOp.getMixedSizes(),
                                         droppedDims, resolvedSizes);
 
-    // If we are inside an InParallel region, temporarily set the insertion
+    // If we are inside a ParallelCombining region, temporarily set the insertion
     // point outside: only ops of ParallelCombiningOpInterface are allowed in
     // there.
     if (isa<mlir::ParallelCombiningOpInterface>(insertSliceOp.getOperation())) {

>From f171aa0c49e5d5b059497eb2ff8c0ba6f26bc050 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 12:35:21 -0400
Subject: [PATCH 05/11] Fix test.

---
 mlir/test/Dialect/SCF/invalid.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index bb7958083e55c..37fc86b18e7f0 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -645,7 +645,7 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 
   %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
+      // expected-error @+1 {{expected only ParallelCombiningOpInterface ops}}
       scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>

>From 5e9237076da458541d33605ef2d9a89e68a90130 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 13:26:47 -0400
Subject: [PATCH 06/11] Another update

---
 mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td | 1 -
 mlir/lib/Dialect/SCF/IR/SCF.cpp                  | 7 +++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index cd134fdfd094c..2453cf5b5b5a4 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1470,7 +1470,6 @@ def Tensor_PadOp : Tensor_Op<"pad", [
 // ParallelInsertSliceOp
 //===----------------------------------------------------------------------===//
 
-// TODO: Implement ParallelCombiningOpInterface.
 def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
        AttrSizedOperandSegments,
        OffsetSizeAndStrideOpInterface,
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 8c68488ab59f8..e69182a564229 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1674,9 +1674,12 @@ struct ForallOpIterArgsFolder : public OpRewritePattern<ForallOp> {
     for (OpResult result : forallOp.getResults()) {
       OpOperand *opOperand = forallOp.getTiedOpOperand(result);
       BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
+      SmallVector<Operation *> combiningOps =
+          forallOp.getCombiningOps(blockArg);
       if ((result.use_empty() &&
-           llvm::all_of(forallOp.getCombiningOps(blockArg),
-                        [](Operation *op) { return op->use_empty(); }))) {
+           llvm::all_of(combiningOps,
+                        [](Operation *op) { return op->use_empty(); })) ||
+          combiningOps.empty()) {
         resultToDelete.insert(result);
       } else {
         resultToReplace.push_back(result);

>From feb5939cc33e0821432c5620d01e4963eaf9df05 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Wed, 10 Sep 2025 14:38:28 -0400
Subject: [PATCH 07/11] Address comments.

---
 .../ParallelCombiningOpInterface.td           | 38 +++++++++----------
 .../TransformOps/LinalgTransformOps.cpp       |  7 ++--
 mlir/lib/Dialect/SCF/IR/SCF.cpp               | 14 +++----
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp      |  7 ++--
 .../Tensor/Transforms/FoldTensorSubsetOps.cpp |  6 +--
 5 files changed, 33 insertions(+), 39 deletions(-)

diff --git a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
index c83fd250d6ef6..ace26f723ef53 100644
--- a/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
+++ b/mlir/include/mlir/Interfaces/ParallelCombiningOpInterface.td
@@ -59,31 +59,31 @@ def InParallelOpInterface : OpInterface<"InParallelOpInterface"> {
 
 def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   let description = [{
-    A parallel combining op is an operation performs parallel updates to
-    destination tensors within the context of a parent iterating operation.
+    A parallel combining op is an operation that models parallel contributions
+    to result tensors within the context of a parent iterating operation.
     
     This interface is designed for operations that need to coordinate parallel
-    insertions or updates to tensors that are being constructed or modified
-    across multiple parallel iterations. The "updated destination" refers to a
-    destination tensor that accumulates results from parallel computations,
-    where each parallel iteration may contribute a slice, element, or region
-    to the final result.
+    insertions or contributions to tensors that are being constructed across
+    multiple parallel iterations. The destination refers to a tensor value that
+    is assembled by aggregating results from parallel computations; each
+    parallel iteration may contribute a slice, element, or region to the final
+    result. No in-place mutation of tensors is implied.
 
     One significant use case for this interface is `tensor.parallel_insert_slice`
-    which allows parallel insertion of slices into a destination tensor. But with
-    this interface, other operations that perform similar parallel updates can
-    also be defined.
+    which allows parallel insertion of slices that are aggregated into a
+    destination tensor. With this interface, other operations that express
+    similar parallel contributions can also be defined.
 
-    This op works within an op  implementing the
-    `InParallelOpInterface` that specifies how the parallel results are combined.
+    This op works within an op implementing the `InParallelOpInterface` that
+    specifies how the parallel results are combined.
 
     Key semantics:
-    - The operation identifies destination tensors that will be updated
-      through the `getUpdatedDestinations` method
-    - Each parallel iteration may update elements or regions of the
-      destination tensor
+    - The operation identifies destination tensors to which iterations
+      contribute through the `getUpdatedDestinations` method
+    - Each parallel iteration may produce elements or regions that are
+      incorporated into the destination tensor
     - The parent iterating operation manages the coordination and ensures
-      proper synchronization of these updates
+      proper synchronization of these contributions
   
     Note: This interface does not verify itself, it is up to the implementing operation
     to verify the correctness of the op.
@@ -91,8 +91,8 @@ def ParallelCombiningOpInterface : OpInterface<"ParallelCombiningOpInterface"> {
   let cppNamespace = "::mlir";
 
   let methods = [
-    InterfaceMethod<[{
-        Returns the list of values updated by this op.
+    InterfaceMethod<[{ 
+        Returns the list of destination values this op contributes to.
       }],
       /*retTy=*/"::mlir::MutableOperandRange",
       /*methodName=*/"getUpdatedDestinations",
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 9cc701c33a686..7f8d45c237765 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -4142,11 +4142,10 @@ DiagnosedSilenceableFailure doit(RewriterBase &rewriter, OpTy target,
   }
 
   // If we are inside a `ParallelCombiningOp` region, temporarily set the
-  // insertion point outside: only ops implementing ParallelCombiningOpInterface are
-  // allowed in there.
-  if (isa<mlir::ParallelCombiningOpInterface>(target.getOperation())) {
+  // insertion point outside: only ops implementing ParallelCombiningOpInterface
+  // are allowed in there.
+  if (isa<mlir::ParallelCombiningOpInterface>(target.getOperation()))
     rewriter.setInsertionPoint(target->getParentOp());
-  }
 
   Value extracted = tensor::ExtractSliceOp::create(
       rewriter, target.getLoc(), target.getDest(), target.getMixedOffsets(),
diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index e69182a564229..ffb3c899b08d0 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1674,12 +1674,7 @@ struct ForallOpIterArgsFolder : public OpRewritePattern<ForallOp> {
     for (OpResult result : forallOp.getResults()) {
       OpOperand *opOperand = forallOp.getTiedOpOperand(result);
       BlockArgument blockArg = forallOp.getTiedBlockArgument(opOperand);
-      SmallVector<Operation *> combiningOps =
-          forallOp.getCombiningOps(blockArg);
-      if ((result.use_empty() &&
-           llvm::all_of(combiningOps,
-                        [](Operation *op) { return op->use_empty(); })) ||
-          combiningOps.empty()) {
+      if (result.use_empty() || forallOp.getCombiningOps(blockArg).empty()) {
         resultToDelete.insert(result);
       } else {
         resultToReplace.push_back(result);
@@ -1981,7 +1976,8 @@ LogicalResult InParallelOp::verify() {
   for (Operation &op : getRegion().front().getOperations()) {
     auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&op);
     if (!inParallelOp) {
-      return this->emitOpError("expected only ParallelCombiningOpInterface") << " ops";
+      return this->emitOpError("expected only ParallelCombiningOpInterface")
+             << " ops";
     }
 
     // Verify that inserts are into out block arguments.
@@ -2028,11 +2024,11 @@ OpResult InParallelOp::getParentResult(int64_t idx) {
 
 SmallVector<BlockArgument> InParallelOp::getDests() {
   SmallVector<BlockArgument> updatedDests;
-  for (auto &yieldingOp : getYieldingOps()) {
+  for (Operation &yieldingOp : getYieldingOps()) {
     auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&yieldingOp);
     if (!inParallelOp)
       continue;
-    for (auto &updatedOperand : inParallelOp.getUpdatedDestinations())
+    for (OpOperand &updatedOperand : inParallelOp.getUpdatedDestinations())
       updatedDests.push_back(cast<BlockArgument>(updatedOperand.get()));
   }
   return updatedDests;
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 68a229b9058fc..0ab8400a80a20 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3846,8 +3846,7 @@ OpFoldResult PadOp::fold(FoldAdaptor) {
 //===----------------------------------------------------------------------===//
 
 OpResult ParallelInsertSliceOp::getTiedOpResult() {
-  InParallelOpInterface parallelCombiningParent =
-      getParallelCombiningParent();
+  InParallelOpInterface parallelCombiningParent = getParallelCombiningParent();
   for (const auto &it :
        llvm::enumerate(parallelCombiningParent.getYieldingOps())) {
     Operation &nextOp = it.value();
@@ -3942,8 +3941,8 @@ MutableOperandRange ParallelInsertSliceOp::getUpdatedDestinations() {
 
 Operation *ParallelInsertSliceOp::getIteratingParent() {
   // Return the parent InParallelOpInterface's parent
-  if (auto combiningOp = dyn_cast<InParallelOpInterface>(
-          getOperation()->getParentOp())) {
+  if (auto combiningOp =
+          dyn_cast<InParallelOpInterface>(getOperation()->getParentOp())) {
     return combiningOp->getParentOp();
   }
   return nullptr;
diff --git a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
index 43d288342bcce..b32faf481af80 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/FoldTensorSubsetOps.cpp
@@ -215,9 +215,9 @@ struct InsertSliceOfInsertSliceFolder : public OpRewritePattern<OpTy> {
                                         sourceInsertSliceOp.getMixedSizes(),
                                         droppedDims, resolvedSizes);
 
-    // If we are inside a ParallelCombining region, temporarily set the insertion
-    // point outside: only ops of ParallelCombiningOpInterface are allowed in
-    // there.
+    // If we are inside a ParallelCombining region, temporarily set the
+    // insertion point outside: only ops of ParallelCombiningOpInterface are
+    // allowed in there.
     if (isa<mlir::ParallelCombiningOpInterface>(insertSliceOp.getOperation())) {
       rewriter.setInsertionPoint(insertSliceOp->getParentOp());
     }

>From 4d343c52bcd52531e97824991d341e178a75c7bf Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 11 Sep 2025 07:35:53 -0400
Subject: [PATCH 08/11] update

---
 mlir/lib/Dialect/SCF/IR/SCF.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index ffb3c899b08d0..21da8cb8a27a5 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -681,7 +681,6 @@ void mlir::scf::promote(RewriterBase &rewriter, scf::ForallOp forallOp) {
   SmallVector<Value> results;
   results.reserve(forallOp.getResults().size());
   for (auto &yieldingOp : terminator.getYieldingOps()) {
-    // Skip non-ParallelInsertSliceOp operations
     auto parallelInsertSliceOp =
         dyn_cast<tensor::ParallelInsertSliceOp>(yieldingOp);
     if (!parallelInsertSliceOp)
@@ -1912,9 +1911,10 @@ struct FoldTensorCastOfOutputIntoForallOp
     auto terminator = newForallOp.getTerminator();
     for (auto [yieldingOp, outputBlockArg] : llvm::zip(
              terminator.getYieldingOps(), newForallOp.getRegionIterArgs())) {
-      auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(yieldingOp);
-      if (inParallelOp)
+      if (auto inParallelOp =
+              dyn_cast<ParallelCombiningOpInterface>(yieldingOp)) {
         inParallelOp.getUpdatedDestinations().assign(outputBlockArg);
+      }
     }
 
     // Cast results back to the original types.

>From bb6ba5823f0464340c859c99b76d1ca94ff6e1ac Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Thu, 11 Sep 2025 11:49:55 -0400
Subject: [PATCH 09/11] Fix bazel

---
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index cd32e98dac693..2288ba9eacc47 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10847,6 +10847,7 @@ cc_library(
         ":LinalgTransformOpsIncGen",
         ":LinalgTransforms",
         ":LinalgUtils",
+        ":ParallelCombiningOpInterface", 
         ":SCFDialect",
         ":SCFTransforms",
         ":Support",

>From cce652c972522108cbbb5fe405531135e785055a Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 12 Sep 2025 13:48:06 -0700
Subject: [PATCH 10/11] Fix according to comments.

---
 mlir/lib/Dialect/SCF/IR/SCF.cpp          | 13 +++++++------
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 21da8cb8a27a5..30a3a95eeb274 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1911,9 +1911,9 @@ struct FoldTensorCastOfOutputIntoForallOp
     auto terminator = newForallOp.getTerminator();
     for (auto [yieldingOp, outputBlockArg] : llvm::zip(
              terminator.getYieldingOps(), newForallOp.getRegionIterArgs())) {
-      if (auto inParallelOp =
+      if (auto parallelCombiningOp =
               dyn_cast<ParallelCombiningOpInterface>(yieldingOp)) {
-        inParallelOp.getUpdatedDestinations().assign(outputBlockArg);
+        parallelCombiningOp.getUpdatedDestinations().assign(outputBlockArg);
       }
     }
 
@@ -1974,8 +1974,8 @@ LogicalResult InParallelOp::verify() {
     return this->emitOpError("expected forall op parent");
 
   for (Operation &op : getRegion().front().getOperations()) {
-    auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&op);
-    if (!inParallelOp) {
+    auto parallelCombiningOp = dyn_cast<ParallelCombiningOpInterface>(&op);
+    if (!parallelCombiningOp) {
       return this->emitOpError("expected only ParallelCombiningOpInterface")
              << " ops";
     }
@@ -2025,8 +2025,10 @@ OpResult InParallelOp::getParentResult(int64_t idx) {
 SmallVector<BlockArgument> InParallelOp::getDests() {
   SmallVector<BlockArgument> updatedDests;
   for (Operation &yieldingOp : getYieldingOps()) {
-    auto inParallelOp = dyn_cast<ParallelCombiningOpInterface>(&yieldingOp);
-    if (!inParallelOp)
+    auto parallelCombiningOp =
+        dyn_cast<ParallelCombiningOpInterface>(&yieldingOp);
+    if (!parallelCombiningOp)
       continue;
-    for (OpOperand &updatedOperand : inParallelOp.getUpdatedDestinations())
+    for (OpOperand &updatedOperand :
+         parallelCombiningOp.getUpdatedDestinations())
       updatedDests.push_back(cast<BlockArgument>(updatedOperand.get()));
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 0ab8400a80a20..5f63ebc8a998a 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3934,13 +3934,13 @@ llvm::SmallBitVector ParallelInsertSliceOp::getDroppedDims() {
   return ::getDroppedDims(getSourceType().getShape(), getMixedSizes());
 }
 
-// ParallelCombiningOpInterface implementation
+// ParallelCombiningOpInterface implementation.
 MutableOperandRange ParallelInsertSliceOp::getUpdatedDestinations() {
   return getDestMutable();
 }
 
 Operation *ParallelInsertSliceOp::getIteratingParent() {
-  // Return the parent InParallelOpInterface's parent
+  // Return the parent InParallelOpInterface's parent.
   if (auto combiningOp =
           dyn_cast<InParallelOpInterface>(getOperation()->getParentOp())) {
     return combiningOp->getParentOp();

>From 4eaa74674e545235eefccc46552d0c89418f6c21 Mon Sep 17 00:00:00 2001
From: Alan Li <me at alanli.org>
Date: Fri, 12 Sep 2025 13:54:14 -0700
Subject: [PATCH 11/11] Fix buildifier linting issue

---
 mlir/lib/Dialect/Tensor/IR/TensorOps.cpp          | 3 +--
 utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 5f63ebc8a998a..fa97b49a41d97 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3942,9 +3942,8 @@ MutableOperandRange ParallelInsertSliceOp::getUpdatedDestinations() {
 Operation *ParallelInsertSliceOp::getIteratingParent() {
   // Return the parent InParallelOpInterface's parent.
   if (auto combiningOp =
-          dyn_cast<InParallelOpInterface>(getOperation()->getParentOp())) {
+          dyn_cast<InParallelOpInterface>(getOperation()->getParentOp()))
     return combiningOp->getParentOp();
-  }
   return nullptr;
 }
 
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 2288ba9eacc47..540c8b85ecaa4 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -10847,7 +10847,7 @@ cc_library(
         ":LinalgTransformOpsIncGen",
         ":LinalgTransforms",
         ":LinalgUtils",
-        ":ParallelCombiningOpInterface", 
+        ":ParallelCombiningOpInterface",
         ":SCFDialect",
         ":SCFTransforms",
         ":Support",