[Mlir-commits] [mlir] [MLIR][Vector] Add warp distribution for `scf.if` (PR #157119)
Charitha Saumya
llvmlistbot at llvm.org
Wed Sep 10 12:46:31 PDT 2025
https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/157119
From 4b202defcaa9dac128041736afa8b0c9481c3bfb Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Fri, 5 Sep 2025 14:48:43 +0000
Subject: [PATCH 1/5] [MLIR][Vector] Add warp distribution for `scf.if`
---
.../Vector/Transforms/VectorDistribute.cpp | 201 ++++++++++++++++++
.../Vector/vector-warp-distribute.mlir | 69 ++++++
2 files changed, 270 insertions(+)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index c84eb2c9f8857..cf5928278aa64 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1713,6 +1713,205 @@ struct WarpOpInsert : public WarpDistributionPattern {
}
};
+struct WarpOpScfIfOp : public WarpDistributionPattern {
+ WarpOpScfIfOp(MLIRContext *ctx, DistributionMapFn fn, PatternBenefit b = 1)
+ : WarpDistributionPattern(ctx, b), distributionMapFn(std::move(fn)) {}
+ LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
+ PatternRewriter &rewriter) const override {
+ gpu::YieldOp warpOpYield = warpOp.getTerminator();
+    // Only pick up `IfOp` if it is the last op before the terminator in the
+    // region.
+ Operation *lastNode = warpOpYield->getPrevNode();
+ auto ifOp = dyn_cast_or_null<scf::IfOp>(lastNode);
+ if (!ifOp)
+ return failure();
+
+ // The current `WarpOp` can yield two types of values:
+ // 1. Not results of `IfOp`:
+ // Preserve them in the new `WarpOp`.
+ // Collect their yield index.
+ // 2. Results of `IfOp`:
+ // They are not part of the new `WarpOp` results.
+ // Map current warp's yield operand index to `IfOp` result idx.
+ SmallVector<Value> nonIfYieldValues;
+ SmallVector<unsigned> nonIfYieldIndices;
+ llvm::SmallDenseMap<unsigned, unsigned> ifResultMapping;
+ llvm::SmallDenseMap<unsigned, VectorType> ifResultDistTypes;
+ for (OpOperand &yieldOperand : warpOpYield->getOpOperands()) {
+ const unsigned yieldOperandIdx = yieldOperand.getOperandNumber();
+ if (yieldOperand.get().getDefiningOp() != ifOp.getOperation()) {
+ nonIfYieldValues.push_back(yieldOperand.get());
+ nonIfYieldIndices.push_back(yieldOperandIdx);
+ continue;
+ }
+ OpResult ifResult = cast<OpResult>(yieldOperand.get());
+ const unsigned ifResultIdx = ifResult.getResultNumber();
+ ifResultMapping[yieldOperandIdx] = ifResultIdx;
+      // If this `ifOp` result is a vector type and it is yielded by the
+      // `WarpOp`, keep track of its distributed type.
+ if (!isa<VectorType>(ifResult.getType()))
+ continue;
+ VectorType distType =
+ cast<VectorType>(warpOp.getResult(yieldOperandIdx).getType());
+ ifResultDistTypes[ifResultIdx] = distType;
+ }
+
+    // Collect `WarpOp`-defined values used in `ifOp`; the new `WarpOp` returns
+    // them.
+ auto getEscapingValues = [&](Region &branch,
+ llvm::SmallSetVector<Value, 32> &values,
+ SmallVector<Type> &inputTypes,
+ SmallVector<Type> &distTypes) {
+ if (branch.empty())
+ return;
+ mlir::visitUsedValuesDefinedAbove(branch, [&](OpOperand *operand) {
+ Operation *parent = operand->get().getParentRegion()->getParentOp();
+ if (warpOp->isAncestor(parent)) {
+ if (!values.insert(operand->get()))
+ return;
+ Type distType = operand->get().getType();
+ if (auto vecType = dyn_cast<VectorType>(distType)) {
+ AffineMap map = distributionMapFn(operand->get());
+ distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+ }
+ inputTypes.push_back(operand->get().getType());
+ distTypes.push_back(distType);
+ }
+ });
+ };
+ llvm::SmallSetVector<Value, 32> escapingValuesThen;
+ SmallVector<Type> escapingValueInputTypesThen; // inner warp op block args
+ SmallVector<Type> escapingValueDistTypesThen; // new warp returns
+ getEscapingValues(ifOp.getThenRegion(), escapingValuesThen,
+ escapingValueInputTypesThen, escapingValueDistTypesThen);
+ llvm::SmallSetVector<Value, 32> escapingValuesElse;
+ SmallVector<Type> escapingValueInputTypesElse; // inner warp op block args
+ SmallVector<Type> escapingValueDistTypesElse; // new warp returns
+ getEscapingValues(ifOp.getElseRegion(), escapingValuesElse,
+ escapingValueInputTypesElse, escapingValueDistTypesElse);
+
+ if (llvm::is_contained(escapingValueDistTypesThen, Type{}) ||
+ llvm::is_contained(escapingValueDistTypesElse, Type{}))
+ return failure();
+
+    // The new `WarpOp` yields values grouped in the following order:
+ // 1. Escaping values then branch
+ // 2. Escaping values else branch
+ // 3. All non-`ifOp` yielded values.
+ SmallVector<Value> newWarpOpYieldValues{escapingValuesThen.begin(),
+ escapingValuesThen.end()};
+ newWarpOpYieldValues.append(escapingValuesElse.begin(),
+ escapingValuesElse.end());
+ SmallVector<Type> newWarpOpDistTypes = escapingValueDistTypesThen;
+ newWarpOpDistTypes.append(escapingValueDistTypesElse.begin(),
+ escapingValueDistTypesElse.end());
+
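+    // Record where each non-`ifOp` yield value lands in the new `WarpOp`
+    // yield list so its users can be remapped after the rewrite.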
+ llvm::SmallDenseMap<unsigned, unsigned> origToNewYieldIdx;
+ for (auto [idx, val] :
+ llvm::zip_equal(nonIfYieldIndices, nonIfYieldValues)) {
+ origToNewYieldIdx[idx] = newWarpOpYieldValues.size();
+ newWarpOpYieldValues.push_back(val);
+ newWarpOpDistTypes.push_back(warpOp.getResult(idx).getType());
+ }
+ // Create the new `WarpOp` with the updated yield values and types.
+ WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
+ rewriter, warpOp, newWarpOpYieldValues, newWarpOpDistTypes);
+
+ // `ifOp` returns the result of the inner warp op.
+ SmallVector<Type> newIfOpDistResTypes;
+ for (auto [i, res] : llvm::enumerate(ifOp.getResults())) {
+ Type distType = cast<Value>(res).getType();
+ if (auto vecType = dyn_cast<VectorType>(distType)) {
+ AffineMap map = distributionMapFn(cast<Value>(res));
+ distType = ifResultDistTypes.count(i)
+ ? ifResultDistTypes[i]
+ : getDistributedType(vecType, map, warpOp.getWarpSize());
+ }
+ newIfOpDistResTypes.push_back(distType);
+ }
+ // Create a new `IfOp` outside the new `WarpOp` region.
+ OpBuilder::InsertionGuard g(rewriter);
+ rewriter.setInsertionPointAfter(newWarpOp);
+ auto newIfOp = scf::IfOp::create(rewriter, ifOp.getLoc(),
+ newIfOpDistResTypes, ifOp.getCondition(),
+ static_cast<bool>(ifOp.thenBlock()),
+ static_cast<bool>(ifOp.elseBlock()));
+
+ auto processBranch = [&](Block *oldIfBranch, Block *newIfBranch,
+ llvm::SmallSetVector<Value, 32> &escapingValues,
+ SmallVector<Type> &escapingValueInputTypes) {
+ OpBuilder::InsertionGuard g(rewriter);
+ if (!newIfBranch)
+ return;
+ rewriter.setInsertionPointToStart(newIfBranch);
+ llvm::SmallDenseMap<Value, int64_t> escapeValToBlockArgIndex;
+ SmallVector<Value> innerWarpInputVals;
+ SmallVector<Type> innerWarpInputTypes;
+ for (size_t i = 0; i < escapingValues.size(); ++i) {
+ innerWarpInputVals.push_back(newWarpOp.getResult(i));
+ escapeValToBlockArgIndex[escapingValues[i]] =
+ innerWarpInputTypes.size();
+ innerWarpInputTypes.push_back(escapingValueInputTypes[i]);
+ }
+ auto innerWarp = WarpExecuteOnLane0Op::create(
+ rewriter, newWarpOp.getLoc(), newIfOp.getResultTypes(),
+ newWarpOp.getLaneid(), newWarpOp.getWarpSize(), innerWarpInputVals,
+ innerWarpInputTypes);
+
+ innerWarp.getWarpRegion().takeBody(*oldIfBranch->getParent());
+ innerWarp.getWarpRegion().addArguments(
+ innerWarpInputTypes,
+ SmallVector<Location>(innerWarpInputTypes.size(), ifOp.getLoc()));
+
+ SmallVector<Value> yieldOperands;
+ for (Value operand : oldIfBranch->getTerminator()->getOperands())
+ yieldOperands.push_back(operand);
+ rewriter.eraseOp(oldIfBranch->getTerminator());
+
+ rewriter.setInsertionPointToEnd(innerWarp.getBody());
+ gpu::YieldOp::create(rewriter, innerWarp.getLoc(), yieldOperands);
+ rewriter.setInsertionPointAfter(innerWarp);
+ scf::YieldOp::create(rewriter, ifOp.getLoc(), innerWarp.getResults());
+
+ // Update any users of escaping values that were forwarded to the
+ // inner `WarpOp`. These values are now arguments of the inner `WarpOp`.
+ innerWarp.walk([&](Operation *op) {
+ for (OpOperand &operand : op->getOpOperands()) {
+ auto it = escapeValToBlockArgIndex.find(operand.get());
+ if (it == escapeValToBlockArgIndex.end())
+ continue;
+ operand.set(innerWarp.getBodyRegion().getArgument(it->second));
+ }
+ });
+ mlir::vector::moveScalarUniformCode(innerWarp);
+ };
+ processBranch(&ifOp.getThenRegion().front(),
+ &newIfOp.getThenRegion().front(), escapingValuesThen,
+ escapingValueInputTypesThen);
+ if (!ifOp.getElseRegion().empty())
+ processBranch(&ifOp.getElseRegion().front(),
+ &newIfOp.getElseRegion().front(), escapingValuesElse,
+ escapingValueInputTypesElse);
+ // Update the users of `<- WarpOp.yield <- IfOp.yield` to use the new `IfOp`
+ // result.
+ for (auto [origIdx, newIdx] : ifResultMapping)
+ rewriter.replaceAllUsesExcept(warpOp.getResult(origIdx),
+ newIfOp.getResult(newIdx), newIfOp);
+ // Similarly, update any users of the `WarpOp` results that were not
+ // results of the `IfOp`.
+ for (auto [origIdx, newIdx] : origToNewYieldIdx)
+ rewriter.replaceAllUsesWith(warpOp.getResult(origIdx),
+ newWarpOp.getResult(newIdx));
+    // Remove the original `WarpOp` and `IfOp`; they should not have any uses
+    // at this point.
+ rewriter.eraseOp(ifOp);
+ rewriter.eraseOp(warpOp);
+ return success();
+ }
+
+private:
+ DistributionMapFn distributionMapFn;
+};
+
/// Sink scf.for region out of WarpExecuteOnLane0Op. This can be done only if
/// the scf.ForOp is the last operation in the region so that it doesn't
/// change the order of execution. This creates a new scf.for region after the
@@ -2068,6 +2267,8 @@ void mlir::vector::populatePropagateWarpVectorDistributionPatterns(
benefit);
patterns.add<WarpOpScfForOp>(patterns.getContext(), distributionMapFn,
benefit);
+ patterns.add<WarpOpScfIfOp>(patterns.getContext(), distributionMapFn,
+ benefit);
}
void mlir::vector::populateDistributeReduction(
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 8750582ef1e1f..bb7639204022f 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -1856,3 +1856,72 @@ func.func @negative_warp_step_more_than_warp_size(%laneid: index, %buffer: memre
// CHECK-PROP-LABEL: @negative_warp_step_more_than_warp_size
// CHECK-PROP-NOT: vector.broadcast
// CHECK-PROP: vector.step : vector<64xindex>
+
+// -----
+
+func.func @warp_scf_if_no_yield_distribute(%buffer: memref<128xindex>, %pred : i1) {
+ %laneid = gpu.lane_id
+ %c0 = arith.constant 0 : index
+
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
+ %seq = vector.step : vector<32xindex>
+ scf.if %pred {
+ vector.store %seq, %buffer[%c0] : memref<128xindex>, vector<32xindex>
+ }
+ gpu.yield
+ }
+ return
+}
+
+// CHECK-PROP-LABEL: func.func @warp_scf_if_no_yield_distribute(
+// CHECK-PROP-SAME: %[[ARG0:.+]]: memref<128xindex>, %[[ARG1:.+]]: i1
+// CHECK-PROP: scf.if %[[ARG1]] {
+// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<1xindex>) {
+// CHECK-PROP: ^bb0(%[[ARG2:.+]]: vector<32xindex>):
+// CHECK-PROP: vector.store %[[ARG2]], %[[ARG0]][%{{.*}}] : memref<128xindex>, vector<32xindex>
+
+// -----
+
+func.func @warp_scf_if_distribute(%pred : i1) {
+ %laneid = gpu.lane_id
+ %c0 = arith.constant 0 : index
+
+ %0 = gpu.warp_execute_on_lane_0(%laneid)[32] -> vector<1xf32> {
+ %seq1 = vector.step : vector<32xindex>
+ %seq2 = arith.constant dense<2> : vector<32xindex>
+ %0 = scf.if %pred -> (vector<32xf32>) {
+ %1 = "some_op"(%seq1) : (vector<32xindex>) -> (vector<32xf32>)
+ scf.yield %1 : vector<32xf32>
+ } else {
+ %2 = "other_op"(%seq2) : (vector<32xindex>) -> (vector<32xf32>)
+ scf.yield %2 : vector<32xf32>
+ }
+ gpu.yield %0 : vector<32xf32>
+ }
+ "some_use"(%0) : (vector<1xf32>) -> ()
+
+ return
+}
+
+// CHECK-PROP-LABEL: func.func @warp_scf_if_distribute(
+// CHECK-PROP-SAME: %[[ARG0:.+]]: i1
+// CHECK-PROP: %[[SEQ2:.+]] = arith.constant dense<2> : vector<32xindex>
+// CHECK-PROP: %[[LANE_ID:.+]] = gpu.lane_id
+// CHECK-PROP: %[[SEQ1:.+]] = vector.broadcast %[[LANE_ID]] : index to vector<1xindex>
+// CHECK-PROP: %[[IF_YIELD_DIST:.+]] = scf.if %[[ARG0]] -> (vector<1xf32>) {
+// CHECK-PROP: %[[THEN_DIST:.+]] = gpu.warp_execute_on_lane_0(%[[LANE_ID]])[32] args(%[[SEQ1]] : vector<1xindex>) -> (vector<1xf32>) {
+// CHECK-PROP: ^bb0(%[[ARG1:.+]]: vector<32xindex>):
+// CHECK-PROP: %{{.*}} = "some_op"(%[[ARG1]]) : (vector<32xindex>) -> vector<32xf32>
+// CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32>
+// CHECK-PROP: }
+// CHECK-PROP: scf.yield %[[THEN_DIST]] : vector<1xf32>
+// CHECK-PROP: } else {
+// CHECK-PROP: %[[ELSE_DIST:.+]] = gpu.warp_execute_on_lane_0(%[[LANE_ID]])[32] -> (vector<1xf32>) {
+// CHECK-PROP: %{{.*}} = "other_op"(%[[SEQ2]]) : (vector<32xindex>) -> vector<32xf32>
+// CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32>
+// CHECK-PROP: }
+// CHECK-PROP: scf.yield %[[ELSE_DIST]] : vector<1xf32>
+// CHECK-PROP: }
+// CHECK-PROP: "some_use"(%[[IF_YIELD_DIST]]) : (vector<1xf32>) -> ()
+// CHECK-PROP: return
+// CHECK-PROP: }
From b356d1119d1053a19e1145e0ff135750009f4cce Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Fri, 5 Sep 2025 16:55:45 +0000
Subject: [PATCH 2/5] Add xegpu tests
---
.../Dialect/XeGPU/subgroup-distribute.mlir | 58 +++++++++++++++++++
1 file changed, 58 insertions(+)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index a39aa90bbe3a8..b57903b2eb69b 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -338,6 +338,64 @@ gpu.module @test {
}
}
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}) {
+// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16>
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16>
+// CHECK-NEXT: } else {
+// CHECK-NEXT: scf.yield %[[DEFAULT]] : vector<8xf16>
+// CHECK-NEXT: }
+// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+gpu.module @test {
+ gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>) {
+ %pred = llvm.mlir.poison : i1
+ %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+ %loaded = scf.if %pred -> (vector<16x8xf16>) {
+ %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+ scf.yield %3 : vector<16x8xf16>
+ } else {
+ %3 = arith.constant {
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ } dense<12.> : vector<16x8xf16>
+ scf.yield %3 : vector<16x8xf16>
+ } { layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]> }
+ xegpu.store %loaded, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ gpu.return
+ }
+}
+
+// -----
+// CHECK-LABEL: gpu.func @scatter_ops_scf_non_yield({{.*}}) {
+// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
+// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
+// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
+// CHECK: scf.if %[[PREDICATE]] {
+// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
+// CHECK-NEXT: xegpu.store %[[LOADED]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
+// CHECK-NEXT: }
+gpu.module @test {
+ gpu.func @scatter_ops_scf_non_yield(%src: memref<256xf16>) {
+ %pred = llvm.mlir.poison : i1
+ %1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
+ %offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
+ scf.if %pred {
+ %3 = xegpu.load %src[%offset], %1 <{chunk_size=8}> {
+ layout_result_0 = #xegpu.layout<lane_layout = [16, 1], lane_data = [1, 2]>
+ } : memref<256xf16>, vector<16xindex>, vector<16xi1> -> vector<16x8xf16>
+ xegpu.store %3, %src[%offset], %1 <{chunk_size=8}> : vector<16x8xf16>, memref<256xf16>, vector<16xindex>, vector<16xi1>
+ }
+ gpu.return
+ }
+}
+
// -----
// CHECK-LABEL: gpu.func @scatter_ops({{.*}}) {
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
From 7d31574c675ddef0b2f4af310954e822cdadf1ee Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Mon, 8 Sep 2025 08:38:31 +0000
Subject: [PATCH 3/5] Yield if condition, range-based escaping values for
innerwarps
---
.../Vector/Transforms/VectorDistribute.cpp | 36 ++++++++++---------
.../Dialect/XeGPU/subgroup-distribute.mlir | 7 ++--
2 files changed, 23 insertions(+), 20 deletions(-)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index cf5928278aa64..db3e9e6922a44 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1794,14 +1794,18 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
return failure();
// The new `WarpOp` yields values grouped in the following order:
- // 1. Escaping values then branch
- // 2. Escaping values else branch
- // 3. All non-`ifOp` yielded values.
- SmallVector<Value> newWarpOpYieldValues{escapingValuesThen.begin(),
- escapingValuesThen.end()};
+ // 1. Branch condition
+ // 2. Escaping values then branch
+ // 3. Escaping values else branch
+ // 4. All non-`ifOp` yielded values.
+ SmallVector<Value> newWarpOpYieldValues{ifOp.getCondition()};
+ newWarpOpYieldValues.append(escapingValuesThen.begin(),
+ escapingValuesThen.end());
newWarpOpYieldValues.append(escapingValuesElse.begin(),
escapingValuesElse.end());
- SmallVector<Type> newWarpOpDistTypes = escapingValueDistTypesThen;
+ SmallVector<Type> newWarpOpDistTypes{ifOp.getCondition().getType()};
+ newWarpOpDistTypes.append(escapingValueDistTypesThen.begin(),
+ escapingValueDistTypesThen.end());
newWarpOpDistTypes.append(escapingValueDistTypesElse.begin(),
escapingValueDistTypesElse.end());
@@ -1815,7 +1819,6 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
// Create the new `WarpOp` with the updated yield values and types.
WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndReplaceReturns(
rewriter, warpOp, newWarpOpYieldValues, newWarpOpDistTypes);
-
// `ifOp` returns the result of the inner warp op.
SmallVector<Type> newIfOpDistResTypes;
for (auto [i, res] : llvm::enumerate(ifOp.getResults())) {
@@ -1831,14 +1834,15 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
// Create a new `IfOp` outside the new `WarpOp` region.
OpBuilder::InsertionGuard g(rewriter);
rewriter.setInsertionPointAfter(newWarpOp);
- auto newIfOp = scf::IfOp::create(rewriter, ifOp.getLoc(),
- newIfOpDistResTypes, ifOp.getCondition(),
- static_cast<bool>(ifOp.thenBlock()),
- static_cast<bool>(ifOp.elseBlock()));
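+    // The condition may be defined inside the original warp region, so use
+    // the copy yielded as the first result of the new `WarpOp`.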
+ auto newIfOp = scf::IfOp::create(
+ rewriter, ifOp.getLoc(), newIfOpDistResTypes, newWarpOp.getResult(0),
+ static_cast<bool>(ifOp.thenBlock()),
+ static_cast<bool>(ifOp.elseBlock()));
auto processBranch = [&](Block *oldIfBranch, Block *newIfBranch,
llvm::SmallSetVector<Value, 32> &escapingValues,
- SmallVector<Type> &escapingValueInputTypes) {
+ SmallVector<Type> &escapingValueInputTypes,
+ size_t warpResRangeStart) {
OpBuilder::InsertionGuard g(rewriter);
if (!newIfBranch)
return;
@@ -1846,8 +1850,8 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
llvm::SmallDenseMap<Value, int64_t> escapeValToBlockArgIndex;
SmallVector<Value> innerWarpInputVals;
SmallVector<Type> innerWarpInputTypes;
- for (size_t i = 0; i < escapingValues.size(); ++i) {
- innerWarpInputVals.push_back(newWarpOp.getResult(i));
+ for (size_t i = 0; i < escapingValues.size(); ++i, ++warpResRangeStart) {
+ innerWarpInputVals.push_back(newWarpOp.getResult(warpResRangeStart));
escapeValToBlockArgIndex[escapingValues[i]] =
innerWarpInputTypes.size();
innerWarpInputTypes.push_back(escapingValueInputTypes[i]);
@@ -1886,11 +1890,11 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
};
processBranch(&ifOp.getThenRegion().front(),
&newIfOp.getThenRegion().front(), escapingValuesThen,
- escapingValueInputTypesThen);
+ escapingValueInputTypesThen, 1);
if (!ifOp.getElseRegion().empty())
processBranch(&ifOp.getElseRegion().front(),
&newIfOp.getElseRegion().front(), escapingValuesElse,
- escapingValueInputTypesElse);
+ escapingValueInputTypesElse, 1 + escapingValuesThen.size());
// Update the users of `<- WarpOp.yield <- IfOp.yield` to use the new `IfOp`
// result.
for (auto [origIdx, newIdx] : ifResultMapping)
diff --git a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
index b57903b2eb69b..60acea06c9a12 100644
--- a/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
+++ b/mlir/test/Dialect/XeGPU/subgroup-distribute.mlir
@@ -339,11 +339,11 @@ gpu.module @test {
}
// -----
-// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}}) {
+// CHECK-LABEL: gpu.func @scatter_ops_scf_yield({{.*}},
+// CHECK-SAME: %[[PREDICATE:.*]]: i1) {
// CHECK: %[[DEFAULT:.*]] = arith.constant dense<1.200000e+01> : vector<8xf16>
// CHECK: %[[OFFSET:.*]] = arith.constant dense<12> : vector<1xindex>
// CHECK: %[[MASK:.*]] = arith.constant dense<true> : vector<1xi1>
-// CHECK: %[[PREDICATE:.*]] = llvm.mlir.poison : i1
// CHECK: %[[PREDICATED_LOAD:.*]] = scf.if %[[PREDICATE]] -> (vector<8xf16>) {
// CHECK-NEXT: %[[LOADED:.*]] = xegpu.load %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : memref<256xf16>, vector<1xindex>, vector<1xi1> -> vector<8xf16>
// CHECK-NEXT: scf.yield %[[LOADED]] : vector<8xf16>
@@ -352,8 +352,7 @@ gpu.module @test {
// CHECK-NEXT: }
// CHECK-NEXT: xegpu.store %[[PREDICATED_LOAD]], %arg0[%[[OFFSET]]], %[[MASK]] <{chunk_size = 8 : i64}> : vector<8xf16>, memref<256xf16>, vector<1xindex>, vector<1xi1>
gpu.module @test {
- gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>) {
- %pred = llvm.mlir.poison : i1
+ gpu.func @scatter_ops_scf_yield(%src: memref<256xf16>, %pred : i1) {
%1 = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<1>: vector<16xi1>
%offset = arith.constant {layout_result_0 = #xegpu.layout<lane_layout = [16], lane_data = [1]>} dense<12> : vector<16xindex>
%loaded = scf.if %pred -> (vector<16x8xf16>) {
From 784dda109ca2c145b1b9f92f87547c54af43dcc7 Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Tue, 9 Sep 2025 10:16:03 +0000
Subject: [PATCH 4/5] Address feedback
---
.../Vector/Transforms/VectorDistribute.cpp | 214 ++++++++++--------
1 file changed, 125 insertions(+), 89 deletions(-)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index db3e9e6922a44..3ae866aeb2888 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -371,6 +371,36 @@ static VectorType getDistributedType(VectorType originalType, AffineMap map,
return targetType;
}
+/// Given a warpOp that contains ops with regions, the corresponding op's
+/// "inner" region and the distributionMapFn, get all values used by the op's
+/// region that are defined within the warpOp. Return the set of values, their
+/// types and their distributed types.
+std::tuple<llvm::SmallSetVector<Value, 32>, SmallVector<Type>,
+ SmallVector<Type>>
+getInnerRegionEscapingValues(WarpExecuteOnLane0Op warpOp, Region &innerRegion,
+ DistributionMapFn distributionMapFn) {
+ llvm::SmallSetVector<Value, 32> escapingValues;
+ SmallVector<Type> escapingValueTypes;
+ SmallVector<Type> escapingValueDistTypes; // to yield from the new warpOp
+ if (innerRegion.empty())
+ return {escapingValues, escapingValueTypes, escapingValueDistTypes};
+ mlir::visitUsedValuesDefinedAbove(innerRegion, [&](OpOperand *operand) {
+ Operation *parent = operand->get().getParentRegion()->getParentOp();
+ if (warpOp->isAncestor(parent)) {
+ if (!escapingValues.insert(operand->get()))
+ return;
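+      // Non-vector escaping values keep their type; vector values are mapped
+      // to their per-lane (distributed) type.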
+ Type distType = operand->get().getType();
+ if (auto vecType = dyn_cast<VectorType>(distType)) {
+ AffineMap map = distributionMapFn(operand->get());
+ distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+ }
+ escapingValueTypes.push_back(operand->get().getType());
+ escapingValueDistTypes.push_back(distType);
+ }
+ });
+ return {escapingValues, escapingValueTypes, escapingValueDistTypes};
+}
+
/// Distribute transfer_write ops based on the affine map returned by
/// `distributionMapFn`. Writes of size more than `maxNumElementToExtract`
/// will not be distributed (it should be less than the warp size).
@@ -1713,6 +1743,32 @@ struct WarpOpInsert : public WarpDistributionPattern {
}
};
+/// Sink scf.if out of WarpExecuteOnLane0Op. This can be done only if
+/// the scf.if is the last operation in the region so that it doesn't
+/// change the order of execution. This creates a new scf.if after the
+/// WarpExecuteOnLane0Op. Each branch of the new scf.if is enclosed in
+/// the "inner" WarpExecuteOnLane0Op. Example:
+/// ```
+/// gpu.warp_execute_on_lane_0(%laneid)[32] {
+/// %payload = ... : vector<32xindex>
+/// scf.if %pred {
+/// vector.store %payload, %buffer[%idx] : memref<128xindex>,
+/// vector<32xindex>
+/// }
+/// gpu.yield
+/// }
+/// ```
+/// To:
+/// ```
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] {
+/// %payload = ... : vector<32xindex>
+/// gpu.yield %payload : vector<32xindex>
+/// }
+/// scf.if %pred {
+/// gpu.warp_execute_on_lane_0(%laneid)[32] args(%r : vector<1xindex>) {
+/// ^bb0(%arg1: vector<32xindex>):
+/// vector.store %arg1, %buffer[%idx] : memref<128xindex>, vector<32xindex>
+/// }
+/// }
+/// ```
struct WarpOpScfIfOp : public WarpDistributionPattern {
WarpOpScfIfOp(MLIRContext *ctx, DistributionMapFn fn, PatternBenefit b = 1)
: WarpDistributionPattern(ctx, b), distributionMapFn(std::move(fn)) {}
@@ -1728,7 +1784,7 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
// The current `WarpOp` can yield two types of values:
// 1. Not results of `IfOp`:
// Preserve them in the new `WarpOp`.
- // Collect their yield index.
+ // Collect their yield index to remap the usages.
// 2. Results of `IfOp`:
// They are not part of the new `WarpOp` results.
// Map current warp's yield operand index to `IfOp` result idx.
@@ -1757,38 +1813,14 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
// Collect `WarpOp`-defined values used in `ifOp`; the new `WarpOp` returns
// them.
- auto getEscapingValues = [&](Region &branch,
- llvm::SmallSetVector<Value, 32> &values,
- SmallVector<Type> &inputTypes,
- SmallVector<Type> &distTypes) {
- if (branch.empty())
- return;
- mlir::visitUsedValuesDefinedAbove(branch, [&](OpOperand *operand) {
- Operation *parent = operand->get().getParentRegion()->getParentOp();
- if (warpOp->isAncestor(parent)) {
- if (!values.insert(operand->get()))
- return;
- Type distType = operand->get().getType();
- if (auto vecType = dyn_cast<VectorType>(distType)) {
- AffineMap map = distributionMapFn(operand->get());
- distType = getDistributedType(vecType, map, warpOp.getWarpSize());
- }
- inputTypes.push_back(operand->get().getType());
- distTypes.push_back(distType);
- }
- });
- };
- llvm::SmallSetVector<Value, 32> escapingValuesThen;
- SmallVector<Type> escapingValueInputTypesThen; // inner warp op block args
- SmallVector<Type> escapingValueDistTypesThen; // new warp returns
- getEscapingValues(ifOp.getThenRegion(), escapingValuesThen,
- escapingValueInputTypesThen, escapingValueDistTypesThen);
- llvm::SmallSetVector<Value, 32> escapingValuesElse;
- SmallVector<Type> escapingValueInputTypesElse; // inner warp op block args
- SmallVector<Type> escapingValueDistTypesElse; // new warp returns
- getEscapingValues(ifOp.getElseRegion(), escapingValuesElse,
- escapingValueInputTypesElse, escapingValueDistTypesElse);
-
+ auto [escapingValuesThen, escapingValueInputTypesThen,
+ escapingValueDistTypesThen] =
+ getInnerRegionEscapingValues(warpOp, ifOp.getThenRegion(),
+ distributionMapFn);
+ auto [escapingValuesElse, escapingValueInputTypesElse,
+ escapingValueDistTypesElse] =
+ getInnerRegionEscapingValues(warpOp, ifOp.getElseRegion(),
+ distributionMapFn);
if (llvm::is_contained(escapingValueDistTypesThen, Type{}) ||
llvm::is_contained(escapingValueDistTypesElse, Type{}))
return failure();
@@ -1825,6 +1857,7 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
Type distType = cast<Value>(res).getType();
if (auto vecType = dyn_cast<VectorType>(distType)) {
AffineMap map = distributionMapFn(cast<Value>(res));
+        // Fall back to the affine map if the distributed result type was not
+        // previously recorded.
distType = ifResultDistTypes.count(i)
? ifResultDistTypes[i]
: getDistributedType(vecType, map, warpOp.getWarpSize());
@@ -1838,63 +1871,66 @@ struct WarpOpScfIfOp : public WarpDistributionPattern {
rewriter, ifOp.getLoc(), newIfOpDistResTypes, newWarpOp.getResult(0),
static_cast<bool>(ifOp.thenBlock()),
static_cast<bool>(ifOp.elseBlock()));
-
- auto processBranch = [&](Block *oldIfBranch, Block *newIfBranch,
- llvm::SmallSetVector<Value, 32> &escapingValues,
- SmallVector<Type> &escapingValueInputTypes,
- size_t warpResRangeStart) {
- OpBuilder::InsertionGuard g(rewriter);
- if (!newIfBranch)
- return;
- rewriter.setInsertionPointToStart(newIfBranch);
- llvm::SmallDenseMap<Value, int64_t> escapeValToBlockArgIndex;
- SmallVector<Value> innerWarpInputVals;
- SmallVector<Type> innerWarpInputTypes;
- for (size_t i = 0; i < escapingValues.size(); ++i, ++warpResRangeStart) {
- innerWarpInputVals.push_back(newWarpOp.getResult(warpResRangeStart));
- escapeValToBlockArgIndex[escapingValues[i]] =
- innerWarpInputTypes.size();
- innerWarpInputTypes.push_back(escapingValueInputTypes[i]);
- }
- auto innerWarp = WarpExecuteOnLane0Op::create(
- rewriter, newWarpOp.getLoc(), newIfOp.getResultTypes(),
- newWarpOp.getLaneid(), newWarpOp.getWarpSize(), innerWarpInputVals,
- innerWarpInputTypes);
-
- innerWarp.getWarpRegion().takeBody(*oldIfBranch->getParent());
- innerWarp.getWarpRegion().addArguments(
- innerWarpInputTypes,
- SmallVector<Location>(innerWarpInputTypes.size(), ifOp.getLoc()));
-
- SmallVector<Value> yieldOperands;
- for (Value operand : oldIfBranch->getTerminator()->getOperands())
- yieldOperands.push_back(operand);
- rewriter.eraseOp(oldIfBranch->getTerminator());
-
- rewriter.setInsertionPointToEnd(innerWarp.getBody());
- gpu::YieldOp::create(rewriter, innerWarp.getLoc(), yieldOperands);
- rewriter.setInsertionPointAfter(innerWarp);
- scf::YieldOp::create(rewriter, ifOp.getLoc(), innerWarp.getResults());
-
- // Update any users of escaping values that were forwarded to the
- // inner `WarpOp`. These values are now arguments of the inner `WarpOp`.
- innerWarp.walk([&](Operation *op) {
- for (OpOperand &operand : op->getOpOperands()) {
- auto it = escapeValToBlockArgIndex.find(operand.get());
- if (it == escapeValToBlockArgIndex.end())
- continue;
- operand.set(innerWarp.getBodyRegion().getArgument(it->second));
- }
- });
- mlir::vector::moveScalarUniformCode(innerWarp);
- };
- processBranch(&ifOp.getThenRegion().front(),
- &newIfOp.getThenRegion().front(), escapingValuesThen,
- escapingValueInputTypesThen, 1);
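+    // Wrap one branch of the new `IfOp` in an inner `WarpOp`: forward the
+    // escaping values as block arguments, move the original branch body in,
+    // and yield the branch results through the inner `WarpOp`.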
+ auto encloseRegionInWarpOp =
+ [&](Block *oldIfBranch, Block *newIfBranch,
+ llvm::SmallSetVector<Value, 32> &escapingValues,
+ SmallVector<Type> &escapingValueInputTypes,
+ size_t warpResRangeStart) {
+ OpBuilder::InsertionGuard g(rewriter);
+ if (!newIfBranch)
+ return;
+ rewriter.setInsertionPointToStart(newIfBranch);
+ llvm::SmallDenseMap<Value, int64_t> escapeValToBlockArgIndex;
+ SmallVector<Value> innerWarpInputVals;
+ SmallVector<Type> innerWarpInputTypes;
+ for (size_t i = 0; i < escapingValues.size();
+ ++i, ++warpResRangeStart) {
+ innerWarpInputVals.push_back(
+ newWarpOp.getResult(warpResRangeStart));
+ escapeValToBlockArgIndex[escapingValues[i]] =
+ innerWarpInputTypes.size();
+ innerWarpInputTypes.push_back(escapingValueInputTypes[i]);
+ }
+ auto innerWarp = WarpExecuteOnLane0Op::create(
+ rewriter, newWarpOp.getLoc(), newIfOp.getResultTypes(),
+ newWarpOp.getLaneid(), newWarpOp.getWarpSize(),
+ innerWarpInputVals, innerWarpInputTypes);
+
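+            // Move the original branch region into the inner `WarpOp` and
+            // append block arguments for the forwarded escaping values.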
+ innerWarp.getWarpRegion().takeBody(*oldIfBranch->getParent());
+ innerWarp.getWarpRegion().addArguments(
+ innerWarpInputTypes,
+ SmallVector<Location>(innerWarpInputTypes.size(), ifOp.getLoc()));
+
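+            // The old terminator's operands become the inner `WarpOp` yield;
+            // the new `scf.if` branch then yields the inner `WarpOp` results.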
+ SmallVector<Value> yieldOperands;
+ for (Value operand : oldIfBranch->getTerminator()->getOperands())
+ yieldOperands.push_back(operand);
+ rewriter.eraseOp(oldIfBranch->getTerminator());
+
+ rewriter.setInsertionPointToEnd(innerWarp.getBody());
+ gpu::YieldOp::create(rewriter, innerWarp.getLoc(), yieldOperands);
+ rewriter.setInsertionPointAfter(innerWarp);
+ scf::YieldOp::create(rewriter, ifOp.getLoc(), innerWarp.getResults());
+
+ // Update any users of escaping values that were forwarded to the
+ // inner `WarpOp`. These values are arguments of the inner `WarpOp`.
+ innerWarp.walk([&](Operation *op) {
+ for (OpOperand &operand : op->getOpOperands()) {
+ auto it = escapeValToBlockArgIndex.find(operand.get());
+ if (it == escapeValToBlockArgIndex.end())
+ continue;
+ operand.set(innerWarp.getBodyRegion().getArgument(it->second));
+ }
+ });
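+            // Hoist scalar code that does not depend on the inner `WarpOp`
+            // (i.e. is uniform across lanes) out of its body.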
+ mlir::vector::moveScalarUniformCode(innerWarp);
+ };
+ encloseRegionInWarpOp(&ifOp.getThenRegion().front(),
+ &newIfOp.getThenRegion().front(), escapingValuesThen,
+ escapingValueInputTypesThen, 1);
if (!ifOp.getElseRegion().empty())
- processBranch(&ifOp.getElseRegion().front(),
- &newIfOp.getElseRegion().front(), escapingValuesElse,
- escapingValueInputTypesElse, 1 + escapingValuesThen.size());
+ encloseRegionInWarpOp(&ifOp.getElseRegion().front(),
+ &newIfOp.getElseRegion().front(),
+ escapingValuesElse, escapingValueInputTypesElse,
+ 1 + escapingValuesThen.size());
// Update the users of `<- WarpOp.yield <- IfOp.yield` to use the new `IfOp`
// result.
for (auto [origIdx, newIdx] : ifResultMapping)
From 90ef1ab5008f808e50ef498d74ca5218147bd85d Mon Sep 17 00:00:00 2001
From: Artem Kroviakov <artem.kroviakov at intel.com>
Date: Wed, 10 Sep 2025 08:16:46 +0000
Subject: [PATCH 5/5] Address feedback
---
.../Vector/Transforms/VectorDistribute.cpp | 32 ++++++-------------
1 file changed, 9 insertions(+), 23 deletions(-)
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 3ae866aeb2888..995a2595e5fbb 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -373,8 +373,8 @@ static VectorType getDistributedType(VectorType originalType, AffineMap map,
/// Given a warpOp that contains ops with regions, the corresponding op's
/// "inner" region and the distributionMapFn, get all values used by the op's
-/// region that are defined within the warpOp. Return the set of values, their
-/// types and their distributed types.
+/// region that are defined within the warpOp, but outside the inner region.
+/// Return the set of values, their types and their distributed types.
std::tuple<llvm::SmallSetVector<Value, 32>, SmallVector<Type>,
SmallVector<Type>>
getInnerRegionEscapingValues(WarpExecuteOnLane0Op warpOp, Region &innerRegion,
@@ -383,7 +383,8 @@ getInnerRegionEscapingValues(WarpExecuteOnLane0Op warpOp, Region &innerRegion,
SmallVector<Type> escapingValueTypes;
SmallVector<Type> escapingValueDistTypes; // to yield from the new warpOp
if (innerRegion.empty())
- return {escapingValues, escapingValueTypes, escapingValueDistTypes};
+ return {std::move(escapingValues), std::move(escapingValueTypes),
+ std::move(escapingValueDistTypes)};
mlir::visitUsedValuesDefinedAbove(innerRegion, [&](OpOperand *operand) {
Operation *parent = operand->get().getParentRegion()->getParentOp();
if (warpOp->isAncestor(parent)) {
@@ -398,7 +399,8 @@ getInnerRegionEscapingValues(WarpExecuteOnLane0Op warpOp, Region &innerRegion,
escapingValueDistTypes.push_back(distType);
}
});
- return {escapingValues, escapingValueTypes, escapingValueDistTypes};
+ return {std::move(escapingValues), std::move(escapingValueTypes),
+ std::move(escapingValueDistTypes)};
}
/// Distribute transfer_write ops based on the affine map returned by
@@ -1998,25 +2000,9 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
return failure();
// Collect Values that come from the `WarpOp` but are outside the `ForOp`.
// Those Values need to be returned by the new warp op.
- llvm::SmallSetVector<Value, 32> escapingValues;
- SmallVector<Type> escapingValueInputTypes;
- SmallVector<Type> escapingValueDistTypes;
- mlir::visitUsedValuesDefinedAbove(
- forOp.getBodyRegion(), [&](OpOperand *operand) {
- Operation *parent = operand->get().getParentRegion()->getParentOp();
- if (warpOp->isAncestor(parent)) {
- if (!escapingValues.insert(operand->get()))
- return;
- Type distType = operand->get().getType();
- if (auto vecType = dyn_cast<VectorType>(distType)) {
- AffineMap map = distributionMapFn(operand->get());
- distType = getDistributedType(vecType, map, warpOp.getWarpSize());
- }
- escapingValueInputTypes.push_back(operand->get().getType());
- escapingValueDistTypes.push_back(distType);
- }
- });
-
+ auto [escapingValues, escapingValueInputTypes, escapingValueDistTypes] =
+ getInnerRegionEscapingValues(warpOp, forOp.getBodyRegion(),
+ distributionMapFn);
if (llvm::is_contained(escapingValueDistTypes, Type{}))
return failure();
// `WarpOp` can yield two types of values: