[Mlir-commits] [mlir] [mlir][xegpu] Refine layout assignment in XeGPU SIMT distribution. (PR #142687)

Charitha Saumya llvmlistbot at llvm.org
Tue Jun 3 17:01:52 PDT 2025


https://github.com/charithaintc updated https://github.com/llvm/llvm-project/pull/142687

>From ff1012e2208ef866a0313289d4bf6e130d1a0eaf Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 27 May 2025 23:40:57 +0000
Subject: [PATCH 01/10] add bug fix

---
 .../Vector/Transforms/VectorDistribute.cpp    | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 045c192787f10..1649fb5f91b42 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -15,10 +15,13 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/RegionUtils.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
 #include <utility>
 
 using namespace mlir;
@@ -1554,22 +1557,36 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
     llvm::SmallSetVector<Value, 32> escapingValues;
     SmallVector<Type> inputTypes;
     SmallVector<Type> distTypes;
+    auto collectEscapingValues = [&](Value value) {
+      if (!escapingValues.insert(value))
+        return;
+      Type distType = value.getType();
+      if (auto vecType = dyn_cast<VectorType>(distType)) {
+        AffineMap map = distributionMapFn(value);
+        distType = getDistributedType(vecType, map, warpOp.getWarpSize());
+      }
+      inputTypes.push_back(value.getType());
+      distTypes.push_back(distType);
+    };
+
     mlir::visitUsedValuesDefinedAbove(
         forOp.getBodyRegion(), [&](OpOperand *operand) {
           Operation *parent = operand->get().getParentRegion()->getParentOp();
           if (warpOp->isAncestor(parent)) {
-            if (!escapingValues.insert(operand->get()))
-              return;
-            Type distType = operand->get().getType();
-            if (auto vecType = dyn_cast<VectorType>(distType)) {
-              AffineMap map = distributionMapFn(operand->get());
-              distType = getDistributedType(vecType, map, warpOp.getWarpSize());
-            }
-            inputTypes.push_back(operand->get().getType());
-            distTypes.push_back(distType);
+            collectEscapingValues(operand->get());
           }
         });
 
+    // Any forOp result that is not already yielded by the warpOp
+    // region is also considered escaping.
+    for (OpResult forResult : forOp.getResults()) {
+      // Check if this forResult is already yielded by the yield op.
+      if (llvm::is_contained(yield->getOperands(), forResult)) {
+        continue;
+      }
+      collectEscapingValues(forResult);
+    }
+
     if (llvm::is_contained(distTypes, Type{}))
       return failure();
 
@@ -1609,7 +1626,12 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
                                     forOp.getResultTypes().end());
     llvm::SmallDenseMap<Value, int64_t> argIndexMapping;
     for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) {
-      warpInput.push_back(newWarpOp.getResult(retIdx));
+      auto newWarpResult = newWarpOp.getResult(retIdx);
+      // Unused forOp results yielded by the warpOp region are already included
+      // in the new ForOp.
+      if (llvm::is_contained(newOperands, newWarpResult))
+        continue;
+      warpInput.push_back(newWarpResult);
       argIndexMapping[escapingValues[i]] = warpInputType.size();
       warpInputType.push_back(inputTypes[i]);
     }

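For context on the distributed types computed above: with an identity distribution map, the escaping vector's trailing dimension is divided by the warp size, which is what produces the vector<4xf32> types in the test added in the next patch. A minimal standalone sketch of that relationship (not the actual getDistributedType helper, which also consults the distribution map):

#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinTypes.h"

// Sketch only: vector<128xf32> with a 32-lane warp yields vector<4xf32>.
static mlir::VectorType getPerLaneType(mlir::VectorType warpType,
                                       int64_t warpSize) {
  llvm::SmallVector<int64_t> shape(warpType.getShape().begin(),
                                   warpType.getShape().end());
  shape.back() /= warpSize;
  return mlir::VectorType::get(shape, warpType.getElementType());
}
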
>From c6eb53fefded7152c2d627c4094b66f616bc53ed Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 28 May 2025 20:22:47 +0000
Subject: [PATCH 02/10] add test

---
 .../Vector/vector-warp-distribute.mlir        | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index 38771f2593449..6c7ac7a5196a7 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -584,6 +584,42 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
   return
 }
 
+// -----
+// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield(
+//       CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
+//       CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32>
+//       CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
+//       CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32>
+//       CHECK-PROP: }
+//       CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) {
+//       CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
+//       CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32>
+//       CHECK-PROP: %[[ACC1:.*]] = "some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32>
+//       CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32>
+//       CHECK-PROP: }
+//       CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32>
+//       CHECK-PROP: }
+//       CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> ()
+func.func @warp_scf_for_unused_yield(%arg0: index) {
+  %c128 = arith.constant 128 : index
+  %c1 = arith.constant 1 : index
+  %c0 = arith.constant 0 : index
+  %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
+    %ini = "some_def"() : () -> (vector<128xf32>)
+    %ini1 = "some_def"() : () -> (vector<128xf32>)
+    %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) {
+      %add = arith.addi %arg3, %c1 : index
+      %1  = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>)
+      %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
+      scf.yield %acc, %1 : vector<128xf32>, vector<128xf32>
+    }
+    gpu.yield %3#0 : vector<128xf32>
+  }
+  "some_use"(%0) : (vector<4xf32>) -> ()
+  return
+}
+
+
 // -----
 
 // CHECK-PROP-LABEL: func @vector_reduction(

>From 3bdb5961d48bf70b63560820375d24e0682dbff8 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 28 May 2025 20:26:01 +0000
Subject: [PATCH 03/10] add comments

---
 mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 1649fb5f91b42..94435588459e6 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -1578,7 +1578,8 @@ struct WarpOpScfForOp : public WarpDistributionPattern {
         });
 
     // Any forOp result that is not already yielded by the warpOp
-    // region is also considered escaping.
+    // region is also considered escaping and must be returned by the
+    // original warpOp.
     for (OpResult forResult : forOp.getResults()) {
       // Check if this forResult is already yielded by the yield op.
       if (llvm::is_contained(yield->getOperands(), forResult)) {

>From fe3ab99da99bfe47dd257a458d01ddd4e24df63e Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 28 May 2025 21:32:04 +0000
Subject: [PATCH 04/10] remove unused headers

---
 mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index 94435588459e6..bd833ddb773f7 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -15,13 +15,10 @@
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Value.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 #include "mlir/Transforms/RegionUtils.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
 #include <utility>
 
 using namespace mlir;

>From f91b64c88ef893a9a7d620cd76345c21a4a46d33 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 2 Jun 2025 18:24:58 +0000
Subject: [PATCH 05/10] save work

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 218 +++++++++++++-----
 1 file changed, 164 insertions(+), 54 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 992700524146a..d178c2c33245e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -12,6 +12,8 @@
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Utils/DistributionUtils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Patterns.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
 #include "mlir/Dialect/XeGPU/IR/XeGPU.h"
@@ -30,6 +32,7 @@
 #include "mlir/IR/Value.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/InliningUtils.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -38,6 +41,7 @@
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/InterleavedRange.h"
+#include "llvm/Support/LogicalResult.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace mlir {
@@ -701,7 +705,47 @@ namespace {
 //===----------------------------------------------------------------------===//
 // LayoutAttrAssignment
 //===----------------------------------------------------------------------===//
+template <typename OpTy>
+class UpdateTensorDescType : public OpConversionPattern<OpTy> {
+public:
+  UpdateTensorDescType(MLIRContext *context,
+                       function_ref<xegpu::LayoutAttr(Value)> getLayoutOfValue,
+                       TypeConverter &typeConverter, PatternBenefit benefit = 1)
+      : OpConversionPattern<OpTy>(typeConverter, context, benefit),
+        getLayoutOfValue(getLayoutOfValue) {}
+  using OpConversionPattern<OpTy>::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Op must have single result.
+    if (op->getNumResults() != 1)
+      return failure();
+    Type resultType = op->getResult(0).getType();
+    // Result type must be a tensor descriptor type.
+    if (!isa<xegpu::TensorDescType>(resultType)) {
+      LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: "
+                        << resultType << "\n");
+      return failure();
+    }
+    auto assignedLayout = getLayoutOfValue(op.getResult());
+    if (!assignedLayout) {
+      LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n");
+      return failure();
+    }
+    // Get the original tensor descriptor type.
+    auto origTensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType);
+    auto newTensorDescTy = xegpu::TensorDescType::get(
+        origTensorDescTy.getContext(), origTensorDescTy.getShape(),
+        origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(),
+        assignedLayout);
+    rewriter.replaceOpWithNewOp<OpTy>(op, newTensorDescTy,
+                                      adaptor.getOperands(), op->getAttrs());
+    return success();
+  }
 
+private:
+  function_ref<xegpu::LayoutAttr(Value)> getLayoutOfValue;
+};
 /// This class is responsible for assigning the layout attributes to the ops and
 /// their users based on the layout propagation analysis result.
 class LayoutAttrAssignment {
@@ -739,15 +783,19 @@ void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
 
 /// Convert the layout assigned to a value to xegpu::LayoutAttr.
 xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+  llvm::errs() << "getLayoutAttrForValue: " << v << "\n";
   LayoutInfo layout = getAnalysisResult(v);
-  if (!layout.isAssigned())
+  if (!layout.isAssigned()) {
+    llvm::errs() << "No layout assigned for value\n";
     return {};
+  }
   SmallVector<int, 2> laneLayout, laneData;
   for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
                                              layout.getDataAsArrayRef())) {
     laneLayout.push_back(static_cast<int>(layout));
     laneData.push_back(static_cast<int>(data));
   }
+  llvm::errs() << "return layout\n";
   return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
 }
 
@@ -820,14 +868,23 @@ LogicalResult LayoutAttrAssignment::assign(Operation *op) {
 
 /// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
 LogicalResult LayoutAttrAssignment::run() {
-  auto walkResult = top->walk([&](Operation *op) {
-    if (failed(assign(op)))
-      return WalkResult::interrupt();
-    return WalkResult::advance();
-  });
-
-  if (walkResult.wasInterrupted())
-    return failure();
+  // auto walkResult = top->walk([&](Operation *op) {
+  //   if (failed(assign(op)))
+  //     return WalkResult::interrupt();
+  //   return WalkResult::advance();
+  // });
+
+  // if (walkResult.wasInterrupted())
+  //   return failure();
+  // apply the UpdateTensorDescType pattern to all ops
+  // RewritePatternSet patterns(top->getContext());
+  // patterns.add<UpdateTensorDescType>(
+  //     top->getContext(), [&](Value v) -> xegpu::LayoutAttr {
+  //       llvm::errs() << "invoking callback for value\n";
+  //       return getLayoutAttrForValue(v);
+  //     });
+  // if (failed(applyPatternsGreedily(top, std::move(patterns))))
+  //   return failure();
 
   return resolveConflicts();
 }
@@ -1597,56 +1654,109 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     analyis.printAnalysisResult(os);
     return;
   }
-  auto getPropagatedLayout = [&](Value val) {
-    return analyis.getLayoutInfo(val);
+  // auto getPropagatedLayout = [&](Value val) {
+  //   return analyis.getLayoutInfo(val);
+  // };
+  auto getXeGpuLayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
+    LayoutInfo layout = analyis.getLayoutInfo(val);
+    if (!layout.isAssigned()) {
+      llvm::errs() << "No layout assigned for value\n";
+      return {};
+    }
+    SmallVector<int, 2> laneLayout, laneData;
+    for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+                                               layout.getDataAsArrayRef())) {
+      laneLayout.push_back(static_cast<int>(layout));
+      laneData.push_back(static_cast<int>(data));
+    }
+    return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData);
+  };
+
+  ConversionTarget target(getContext());
+  target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(
+      [&](Operation *op) {
+        return llvm::all_of(op->getResults(), [&](Value val) {
+          if (auto descType = dyn_cast<xegpu::TensorDescType>(val.getType())) {
+            return descType.getLayoutAttr() != nullptr;
+          }
+          return true; // Non-tensor descriptor types are always legal.
+        });
+      });
+  target.addLegalOp<UnrealizedConversionCastOp>();
+  TypeConverter typeConverter;
+  typeConverter.addConversion([](Type type) { return type; });
+  // // typeConverter.addConversion([](xegpu::TensorDescType type) {
+  // //   return xegpu::TensorDescType::get(
+  // //       type.getContext(), type.getShape(), type.getElementType(),
+  // //       type.getEncoding(),
+  // //       xegpu::LayoutAttr::get(type.getContext(), {1, 1}, {1, 1}));
+  // // });
+  auto addUnrealizedCast = [](OpBuilder &builder, Type type, ValueRange inputs,
+                              Location loc) -> Value {
+    auto cast = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
+    return cast.getResult(0);
   };
 
+  typeConverter.addSourceMaterialization(addUnrealizedCast);
+  typeConverter.addTargetMaterialization(addUnrealizedCast);
+
+  RewritePatternSet patterns(&getContext());
+  patterns.add<UpdateTensorDescType<xegpu::CreateNdDescOp>,
+               UpdateTensorDescType<xegpu::UpdateNdOffsetOp>>(
+      &getContext(), getXeGpuLayoutForValue, typeConverter);
+  if (failed(
+          applyPartialConversion(getOperation(), target, std::move(patterns))))
+    signalPassFailure();
+
   // Assign xegpu::LayoutAttr to all ops and their users based on the layout
   // propagation analysis result.
-  LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
-  if (failed(layoutAssignment.run())) {
-    signalPassFailure();
-    return;
-  }
+  // LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
+  // if (failed(layoutAssignment.run())) {
+  //   signalPassFailure();
+  //   return;
+  // }
 
   // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
   // operation.
-  {
-    RewritePatternSet patterns(&getContext());
-    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-      signalPassFailure();
-      return;
-    }
-    // At this point, we have moved the entire function body inside the warpOp.
-    // Now move any scalar uniform code outside of the warpOp (like GPU index
-    // ops, scalar constants, etc.). This will simplify the later lowering and
-    // avoid custom patterns for these ops.
-    getOperation()->walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
-        vector::moveScalarUniformCode(warpOp);
-      }
-    });
-  }
-  // Finally, do the SIMD to SIMT distribution.
-  RewritePatternSet patterns(&getContext());
-  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  // TODO: distributionFn and shuffleFn are not used at this point.
-  auto distributionFn = [](Value val) {
-    VectorType vecType = dyn_cast<VectorType>(val.getType());
-    int64_t vecRank = vecType ? vecType.getRank() : 0;
-    OpBuilder builder(val.getContext());
-    if (vecRank == 0)
-      return AffineMap::get(val.getContext());
-    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
-  };
-  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
-                      int64_t warpSz) { return Value(); };
-  vector::populatePropagateWarpVectorDistributionPatterns(
-      patterns, distributionFn, shuffleFn);
-  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-    signalPassFailure();
-    return;
-  }
+  // {
+  //   RewritePatternSet patterns(&getContext());
+  //   patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+
+  //   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  //     signalPassFailure();
+  //     return;
+  //   }
+  //   // At this point, we have moved the entire function body inside the
+  //   warpOp.
+  //   // Now move any scalar uniform code outside of the warpOp (like GPU index
+  //   // ops, scalar constants, etc.). This will simplify the later lowering
+  //   and
+  //   // avoid custom patterns for these ops.
+  //   getOperation()->walk([&](Operation *op) {
+  //     if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
+  //       vector::moveScalarUniformCode(warpOp);
+  //     }
+  //   });
+  // }
+  // // Finally, do the SIMD to SIMT distribution.
+  // RewritePatternSet patterns(&getContext());
+  // xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+  // // TODO: distributionFn and shuffleFn are not used at this point.
+  // auto distributionFn = [](Value val) {
+  //   VectorType vecType = dyn_cast<VectorType>(val.getType());
+  //   int64_t vecRank = vecType ? vecType.getRank() : 0;
+  //   OpBuilder builder(val.getContext());
+  //   if (vecRank == 0)
+  //     return AffineMap::get(val.getContext());
+  //   return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+  // };
+  // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value
+  // srcIdx,
+  //                     int64_t warpSz) { return Value(); };
+  // vector::populatePropagateWarpVectorDistributionPatterns(
+  //     patterns, distributionFn, shuffleFn);
+  // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  //   signalPassFailure();
+  //   return;
+  // }
 }

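Note that patch 05 is an intermediate "save work" state: the original layout-assignment walk and the distribution driver are commented out while the dialect-conversion approach is tried, and the TensorDescType conversion itself is still commented out. As a minimal sketch (with a placeholder layout where the real pass would query the propagation analysis), that commented-out conversion would look like:

#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/Transforms/DialectConversion.h"

// Sketch of the commented-out conversion above; the {1, 1}/{1, 1} lane layout
// and lane data are placeholders, not the analysis result.
static void addDescLayoutConversion(mlir::TypeConverter &typeConverter) {
  typeConverter.addConversion([](mlir::xegpu::TensorDescType type) {
    return mlir::xegpu::TensorDescType::get(
        type.getContext(), type.getShape(), type.getElementType(),
        type.getEncoding(),
        mlir::xegpu::LayoutAttr::get(type.getContext(), {1, 1}, {1, 1}));
  });
}
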
>From 5cacace6c3f56f3d84b2a63003c2f3d9947b195a Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Mon, 2 Jun 2025 22:57:27 +0000
Subject: [PATCH 06/10] initial version

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 487 ++++++++++--------
 1 file changed, 267 insertions(+), 220 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index d178c2c33245e..aa982ae779d1e 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -32,6 +32,7 @@
 #include "mlir/IR/Value.h"
 #include "mlir/IR/Visitors.h"
 #include "mlir/Interfaces/FunctionInterfaces.h"
+#include "mlir/Support/LLVM.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/InliningUtils.h"
@@ -700,203 +701,264 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
-namespace {
+// namespace {
 
 //===----------------------------------------------------------------------===//
 // LayoutAttrAssignment
 //===----------------------------------------------------------------------===//
-template <typename OpTy>
-class UpdateTensorDescType : public OpConversionPattern<OpTy> {
-public:
-  UpdateTensorDescType(MLIRContext *context,
-                       function_ref<xegpu::LayoutAttr(Value)> getLayoutOfValue,
-                       TypeConverter &typeConverter, PatternBenefit benefit = 1)
-      : OpConversionPattern<OpTy>(typeConverter, context, benefit),
-        getLayoutOfValue(getLayoutOfValue) {}
-  using OpConversionPattern<OpTy>::OpConversionPattern;
-  LogicalResult
-  matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
-                  ConversionPatternRewriter &rewriter) const override {
-    // Op must have single result.
-    if (op->getNumResults() != 1)
-      return failure();
-    Type resultType = op->getResult(0).getType();
-    // Result type must be a tensor descriptor type.
-    if (!isa<xegpu::TensorDescType>(resultType)) {
-      LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: "
-                        << resultType << "\n");
-      return failure();
+// template <typename OpTy>
+// class UpdateTensorDescType : public OpConversionPattern<OpTy> {
+// public:
+//   UpdateTensorDescType(MLIRContext *context,
+//                        function_ref<xegpu::LayoutAttr(Value)>
+//                        getLayoutOfValue, TypeConverter &typeConverter,
+//                        PatternBenefit benefit = 1)
+//       : OpConversionPattern<OpTy>(typeConverter, context, benefit),
+//         getLayoutOfValue(getLayoutOfValue) {}
+//   using OpConversionPattern<OpTy>::OpConversionPattern;
+//   LogicalResult
+//   matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor,
+//                   ConversionPatternRewriter &rewriter) const override {
+//     // Op must have single result.
+//     if (op->getNumResults() != 1)
+//       return failure();
+//     Type resultType = op->getResult(0).getType();
+//     // Result type must be a tensor descriptor type.
+//     if (!isa<xegpu::TensorDescType>(resultType)) {
+//       LLVM_DEBUG(DBGS() << "Result type is not a tensor descriptor type: "
+//                         << resultType << "\n");
+//       return failure();
+//     }
+//     auto assignedLayout = getLayoutOfValue(op.getResult());
+//     if (!assignedLayout) {
+//       LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n");
+//       return failure();
+//     }
+//     // Get the original tensor descriptor type.
+//     auto origTensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType);
+//     auto newTensorDescTy = xegpu::TensorDescType::get(
+//         origTensorDescTy.getContext(), origTensorDescTy.getShape(),
+//         origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(),
+//         assignedLayout);
+//     rewriter.replaceOpWithNewOp<OpTy>(op, newTensorDescTy,
+//                                       adaptor.getOperands(), op->getAttrs());
+//     return success();
+//   }
+
+// private:
+//   function_ref<xegpu::LayoutAttr(Value)> getLayoutOfValue;
+// };
+// /// This class is responsible for assigning the layout attributes to the ops
+// and
+// /// their users based on the layout propagation analysis result.
+// class LayoutAttrAssignment {
+// public:
+//   LayoutAttrAssignment(Operation *top,
+//                        function_ref<LayoutInfo(Value)> getLayout)
+//       : getAnalysisResult(getLayout), top(top) {}
+
+//   LogicalResult run();
+
+// private:
+//   LogicalResult assign(Operation *op);
+//   void assignToUsers(Value v, xegpu::LayoutAttr layout);
+//   xegpu::LayoutAttr getLayoutAttrForValue(Value v);
+//   LogicalResult resolveConflicts();
+//   // Callable to get the layout of a value based on the layout propagation
+//   // analysis.
+//   function_ref<LayoutInfo(Value)> getAnalysisResult;
+//   Operation *top;
+// };
+
+// } // namespace
+
+// /// Helper to assign the layout attribute to the users of the value.
+// void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
+//   for (OpOperand &user : v.getUses()) {
+//     Operation *owner = user.getOwner();
+//     unsigned operandNumber = user.getOperandNumber();
+//     // Use a generic name for ease of querying the layout attribute later.
+//     std::string attrName =
+//         operandLayoutNamePrefix + std::to_string(operandNumber);
+//     owner->setAttr(attrName, layout);
+//   }
+// }
+
+// /// Convert the layout assigned to a value to xegpu::LayoutAttr.
+// xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
+//   llvm::errs() << "getLayoutAttrForValue: " << v << "\n";
+//   LayoutInfo layout = getAnalysisResult(v);
+//   if (!layout.isAssigned()) {
+//     llvm::errs() << "No layout assigned for value\n";
+//     return {};
+//   }
+//   SmallVector<int, 2> laneLayout, laneData;
+//   for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
+//                                              layout.getDataAsArrayRef())) {
+//     laneLayout.push_back(static_cast<int>(layout));
+//     laneData.push_back(static_cast<int>(data));
+//   }
+//   llvm::errs() << "return layout\n";
+//   return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
+// }
+
+// /// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
+// /// based on the layout propagation analysis result.
+// LogicalResult LayoutAttrAssignment::assign(Operation *op) {
+//   // For function ops, propagate the function argument layout to the users.
+//   if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+//     for (BlockArgument arg : func.getArguments()) {
+//       xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg);
+//       if (layoutInfo) {
+//         assignToUsers(arg, layoutInfo);
+//       }
+//     }
+//     return success();
+//   }
+//   // If no results, move on.
+//   if (op->getNumResults() == 0)
+//     return success();
+//   // If all the results are scalars, move on.
+//   if (llvm::all_of(op->getResultTypes(),
+//                    [](Type t) { return t.isIntOrIndexOrFloat(); }))
+//     return success();
+//   // If the op has more than one result and at least one result is a tensor
+//   // descriptor, exit. This case is not supported yet.
+//   // TODO: Support this case.
+//   if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type
+//   t) {
+//         return isa<xegpu::TensorDescType>(t);
+//       })) {
+//     LLVM_DEBUG(
+//         DBGS() << op->getName()
+//                << " op has more than one result and at least one is a tensor
+//                "
+//                   "descriptor. This case is not handled.\n");
+//     return failure();
+//   }
+//   // If the result is a tensor descriptor, attach the layout to the tensor
+//   // descriptor itself.
+//   if (auto tensorDescTy =
+//           dyn_cast<xegpu::TensorDescType>(op->getResultTypes()[0])) {
+//     xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0));
+//     if (!layoutInfo) {
+//       LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+//       return failure();
+//     }
+
+//     // Clone the op, attach the layout to the result tensor descriptor, and
+//     // remove the original op.
+//     OpBuilder builder(op);
+//     Operation *newOp = builder.clone(*op);
+//     auto newTensorDescTy = xegpu::TensorDescType::get(
+//         tensorDescTy.getContext(), tensorDescTy.getShape(),
+//         tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+//         layoutInfo);
+//     newOp->getResult(0).setType(newTensorDescTy);
+//     op->replaceAllUsesWith(newOp->getResults());
+//     op->erase();
+//     return success();
+//   }
+//   // Otherwise simply attach the layout to the op itself.
+//   for (auto [i, r] : llvm::enumerate(op->getResults())) {
+//     xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
+//     if (layoutInfo) {
+//       std::string attrName = resultLayoutNamePrefix + std::to_string(i);
+//       op->setAttr(attrName, layoutInfo);
+//       // Attach the layout attribute to the users of the result.
+//       assignToUsers(r, layoutInfo);
+//     }
+//   }
+//   return success();
+// }
+
+// /// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
+// LogicalResult LayoutAttrAssignment::run() {
+//   // auto walkResult = top->walk([&](Operation *op) {
+//   //   if (failed(assign(op)))
+//   //     return WalkResult::interrupt();
+//   //   return WalkResult::advance();
+//   // });
+
+//   // if (walkResult.wasInterrupted())
+//   //   return failure();
+//   // apply the UpdateTensorDescType pattern to all ops
+//   // RewritePatternSet patterns(top->getContext());
+//   // patterns.add<UpdateTensorDescType>(
+//   //     top->getContext(), [&](Value v) -> xegpu::LayoutAttr {
+//   //       llvm::errs() << "invoking callback for value\n";
+//   //       return getLayoutAttrForValue(v);
+//   //     });
+//   // if (failed(applyPatternsGreedily(top, std::move(patterns))))
+//   //   return failure();
+
+//   return resolveConflicts();
+// }
+
+// /// TODO: Implement the layout conflict resolution. This must ensure mainly
+// two
+// /// things:
+// /// 1) Is a given layout supported by the op? (need to query the target
+// ///    HW info). Otherwise can we achieve this layout using a layout
+// conversion?
+// /// 2) Do all the operands have the required layout? If not, can it
+// ///    be resolved using a layout conversion?
+// LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
+using GetLayoutCallbackFnTy = function_ref<xegpu::LayoutAttr(Value)>;
+static void handleBranchTerminatorOpInterface(
+    mlir::OpBuilder &builder,
+    mlir::RegionBranchTerminatorOpInterface terminator,
+    GetLayoutCallbackFnTy getLayoutOfValue) {}
+static void handleBranchOpInterface(mlir::OpBuilder &builder,
+                                    mlir::RegionBranchOpInterface branch,
+                                    GetLayoutCallbackFnTy getLayoutOfValue) {}
+static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block,
+                             GetLayoutCallbackFnTy getLayoutOfValue) {}
+static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
+                     GetLayoutCallbackFnTy getLayoutOfValue) {
+
+  auto updateValue = [&](Value v, unsigned vIndex,
+                         const std::string &layoutAttrName) {
+    // Layouts are needed only for vector and tensor descriptor types.
+    if (!isa<VectorType, xegpu::TensorDescType>(v.getType()))
+      return;
+    xegpu::LayoutAttr layout = getLayoutOfValue(v);
+    if (!layout) {
+      // TODO : handle error.
+      LLVM_DEBUG(DBGS() << "Expecting layout for value: " << v
+                        << " but got none.\n");
+      return;
     }
-    auto assignedLayout = getLayoutOfValue(op.getResult());
-    if (!assignedLayout) {
-      LLVM_DEBUG(DBGS() << "No layout assigned for " << *op << "\n");
-      return failure();
+    auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(v.getType());
+
+    if (tensorDescTy) {
+      auto newTensorDescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      v.setType(newTensorDescTy);
+      return;
     }
-    // Get the original tensor descriptor type.
-    auto origTensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType);
-    auto newTensorDescTy = xegpu::TensorDescType::get(
-        origTensorDescTy.getContext(), origTensorDescTy.getShape(),
-        origTensorDescTy.getElementType(), origTensorDescTy.getEncoding(),
-        assignedLayout);
-    rewriter.replaceOpWithNewOp<OpTy>(op, newTensorDescTy,
-                                      adaptor.getOperands(), op->getAttrs());
-    return success();
-  }
-
-private:
-  function_ref<xegpu::LayoutAttr(Value)> getLayoutOfValue;
-};
-/// This class is responsible for assigning the layout attributes to the ops and
-/// their users based on the layout propagation analysis result.
-class LayoutAttrAssignment {
-public:
-  LayoutAttrAssignment(Operation *top,
-                       function_ref<LayoutInfo(Value)> getLayout)
-      : getAnalysisResult(getLayout), top(top) {}
-
-  LogicalResult run();
-
-private:
-  LogicalResult assign(Operation *op);
-  void assignToUsers(Value v, xegpu::LayoutAttr layout);
-  xegpu::LayoutAttr getLayoutAttrForValue(Value v);
-  LogicalResult resolveConflicts();
-  // Callable to get the layout of a value based on the layout propagation
-  // analysis.
-  function_ref<LayoutInfo(Value)> getAnalysisResult;
-  Operation *top;
-};
-
-} // namespace
-
-/// Helper to assign the layout attribute to the users of the value.
-void LayoutAttrAssignment::assignToUsers(Value v, xegpu::LayoutAttr layout) {
-  for (OpOperand &user : v.getUses()) {
-    Operation *owner = user.getOwner();
-    unsigned operandNumber = user.getOperandNumber();
-    // Use a generic name for ease of querying the layout attribute later.
-    std::string attrName =
-        operandLayoutNamePrefix + std::to_string(operandNumber);
-    owner->setAttr(attrName, layout);
-  }
-}
-
-/// Convert the layout assigned to a value to xegpu::LayoutAttr.
-xegpu::LayoutAttr LayoutAttrAssignment::getLayoutAttrForValue(Value v) {
-  llvm::errs() << "getLayoutAttrForValue: " << v << "\n";
-  LayoutInfo layout = getAnalysisResult(v);
-  if (!layout.isAssigned()) {
-    llvm::errs() << "No layout assigned for value\n";
-    return {};
-  }
-  SmallVector<int, 2> laneLayout, laneData;
-  for (auto [layout, data] : llvm::zip_equal(layout.getLayoutAsArrayRef(),
-                                             layout.getDataAsArrayRef())) {
-    laneLayout.push_back(static_cast<int>(layout));
-    laneData.push_back(static_cast<int>(data));
-  }
-  llvm::errs() << "return layout\n";
-  return xegpu::LayoutAttr::get(v.getContext(), laneLayout, laneData);
-}
+    // If type is vector, add a temporary layout attribute to the op.
+    op->setAttr(layoutAttrName, layout);
+  };
 
-/// Assign xegpu::LayoutAttr to the op and its users. The layout is assigned
-/// based on the layout propagation analysis result.
-LogicalResult LayoutAttrAssignment::assign(Operation *op) {
-  // For function ops, propagate the function argument layout to the users.
-  if (auto func = dyn_cast<FunctionOpInterface>(op)) {
-    for (BlockArgument arg : func.getArguments()) {
-      xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(arg);
-      if (layoutInfo) {
-        assignToUsers(arg, layoutInfo);
-      }
-    }
-    return success();
-  }
-  // If no results, move on.
-  if (op->getNumResults() == 0)
-    return success();
-  // If all the results are scalars, move on.
-  if (llvm::all_of(op->getResultTypes(),
-                   [](Type t) { return t.isIntOrIndexOrFloat(); }))
-    return success();
-  // If the op has more than one result and at least one result is a tensor
-  // descriptor, exit. This case is not supported yet.
-  // TODO: Support this case.
-  if (op->getNumResults() > 1 && llvm::any_of(op->getResultTypes(), [](Type t) {
-        return isa<xegpu::TensorDescType>(t);
-      })) {
-    LLVM_DEBUG(
-        DBGS() << op->getName()
-               << " op has more than one result and at least one is a tensor "
-                  "descriptor. This case is not handled.\n");
-    return failure();
+  // Iterate over all the operands.
+  for (OpOperand &operand : op->getOpOperands()) {
+    unsigned operandIndex = operand.getOperandNumber();
+    std::string operandLayoutName =
+        operandLayoutNamePrefix + std::to_string(operandIndex);
+    updateValue(operand.get(), operandIndex, operandLayoutName);
   }
-  // If the result is a tensor descriptor, attach the layout to the tensor
-  // descriptor itself.
-  if (auto tensorDescTy =
-          dyn_cast<xegpu::TensorDescType>(op->getResultTypes()[0])) {
-    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(op->getResult(0));
-    if (!layoutInfo) {
-      LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
-      return failure();
-    }
 
-    // Clone the op, attach the layout to the result tensor descriptor, and
-    // remove the original op.
-    OpBuilder builder(op);
-    Operation *newOp = builder.clone(*op);
-    auto newTensorDescTy = xegpu::TensorDescType::get(
-        tensorDescTy.getContext(), tensorDescTy.getShape(),
-        tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layoutInfo);
-    newOp->getResult(0).setType(newTensorDescTy);
-    op->replaceAllUsesWith(newOp->getResults());
-    op->erase();
-    return success();
+  // Iterate over all the results.
+  for (OpResult result : op->getResults()) {
+    unsigned resultIndex = result.getResultNumber();
+    std::string resultLayoutName =
+        resultLayoutNamePrefix + std::to_string(resultIndex);
+    updateValue(result, resultIndex, resultLayoutName);
   }
-  // Otherwise simply attach the layout to the op itself.
-  for (auto [i, r] : llvm::enumerate(op->getResults())) {
-    xegpu::LayoutAttr layoutInfo = getLayoutAttrForValue(r);
-    if (layoutInfo) {
-      std::string attrName = resultLayoutNamePrefix + std::to_string(i);
-      op->setAttr(attrName, layoutInfo);
-      // Attach the layout attribute to the users of the result.
-      assignToUsers(r, layoutInfo);
-    }
-  }
-  return success();
 }
 
-/// Walk the IR and attach xegpu::LayoutAttr to all ops and their users.
-LogicalResult LayoutAttrAssignment::run() {
-  // auto walkResult = top->walk([&](Operation *op) {
-  //   if (failed(assign(op)))
-  //     return WalkResult::interrupt();
-  //   return WalkResult::advance();
-  // });
-
-  // if (walkResult.wasInterrupted())
-  //   return failure();
-  // apply the UpdateTensorDescType pattern to all ops
-  // RewritePatternSet patterns(top->getContext());
-  // patterns.add<UpdateTensorDescType>(
-  //     top->getContext(), [&](Value v) -> xegpu::LayoutAttr {
-  //       llvm::errs() << "invoking callback for value\n";
-  //       return getLayoutAttrForValue(v);
-  //     });
-  // if (failed(applyPatternsGreedily(top, std::move(patterns))))
-  //   return failure();
-
-  return resolveConflicts();
-}
-
-/// TODO: Implement the layout conflict resolution. This must ensure mainly two
-/// things:
-/// 1) Is a given layout supported by the op? (need to query the target
-///    HW info). Otherwise can we achieve this layout using a layout conversion?
-/// 2) Do all the operands have the required layout? If not, can it
-///    be resolved using a layout conversion?
-LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
-
 namespace {
 
 //===----------------------------------------------------------------------===//
@@ -1657,10 +1719,10 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   // auto getPropagatedLayout = [&](Value val) {
   //   return analyis.getLayoutInfo(val);
   // };
-  auto getXeGpuLayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
+  auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
     LayoutInfo layout = analyis.getLayoutInfo(val);
     if (!layout.isAssigned()) {
-      llvm::errs() << "No layout assigned for value\n";
+      llvm::errs() << "No layout assigned for value" << val << "\n";
       return {};
     }
     SmallVector<int, 2> laneLayout, laneData;
@@ -1672,41 +1734,26 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     return xegpu::LayoutAttr::get(val.getContext(), laneLayout, laneData);
   };
 
-  ConversionTarget target(getContext());
-  target.addDynamicallyLegalOp<xegpu::CreateNdDescOp, xegpu::UpdateNdOffsetOp>(
-      [&](Operation *op) {
-        return llvm::all_of(op->getResults(), [&](Value val) {
-          if (auto descType = dyn_cast<xegpu::TensorDescType>(val.getType())) {
-            return descType.getLayoutAttr() != nullptr;
-          }
-          return true; // Non-tensor descriptor types are always legal.
-        });
-      });
-  target.addLegalOp<UnrealizedConversionCastOp>();
-  TypeConverter typeConverter;
-  typeConverter.addConversion([](Type type) { return type; });
-  // // typeConverter.addConversion([](xegpu::TensorDescType type) {
-  // //   return xegpu::TensorDescType::get(
-  // //       type.getContext(), type.getShape(), type.getElementType(),
-  // //       type.getEncoding(),
-  // //       xegpu::LayoutAttr::get(type.getContext(), {1, 1}, {1, 1}));
-  // // });
-  auto addUnrealizedCast = [](OpBuilder &builder, Type type, ValueRange inputs,
-                              Location loc) -> Value {
-    auto cast = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
-    return cast.getResult(0);
-  };
+  mlir::OpBuilder builder(&getContext());
+  Operation *op = getOperation();
+  op->walk([&](mlir::Block *block) {
+    for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
+      if (auto terminator =
+              mlir::dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
+        handleBranchTerminatorOpInterface(builder, terminator,
+                                          getXeGPULayoutForValue);
+        continue;
+      }
 
-  typeConverter.addSourceMaterialization(addUnrealizedCast);
-  typeConverter.addTargetMaterialization(addUnrealizedCast);
+      if (auto iface = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op)) {
+        handleBranchOpInterface(builder, iface, getXeGPULayoutForValue);
+        continue;
+      }
+      updateOp(builder, &op, getXeGPULayoutForValue);
+    }
 
-  RewritePatternSet patterns(&getContext());
-  patterns.add<UpdateTensorDescType<xegpu::CreateNdDescOp>,
-               UpdateTensorDescType<xegpu::UpdateNdOffsetOp>>(
-      &getContext(), getXeGpuLayoutForValue, typeConverter);
-  if (failed(
-          applyPartialConversion(getOperation(), target, std::move(patterns))))
-    signalPassFailure();
+    updateBlockTypes(builder, *block, getXeGPULayoutForValue);
+  });
 
   // Assign xegpu::LayoutAttr to all ops and their users based on the layout
   // propagation analysis result.

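To make the intent of updateOp concrete, here is a hypothetical before/after IR sketch (not taken from the patch; the lane_layout/lane_data values and the attribute names are illustrative): tensor descriptor results get the layout baked into their type, while vector-producing ops get a temporary result-layout attribute.

// Illustrative only: layout values and "layout_result_0" naming are placeholders.
func.func @before(%src: memref<8x16xf16>) -> vector<8x16xf16> {
  %td = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16>
      -> !xegpu.tensor_desc<8x16xf16>
  %v = xegpu.load_nd %td : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16>
  return %v : vector<8x16xf16>
}

func.func @after(%src: memref<8x16xf16>) -> vector<8x16xf16> {
  %td = xegpu.create_nd_tdesc %src[0, 0] : memref<8x16xf16>
      -> !xegpu.tensor_desc<8x16xf16,
           #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
  %v = xegpu.load_nd %td
      {layout_result_0 = #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>}
      : !xegpu.tensor_desc<8x16xf16,
           #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
      -> vector<8x16xf16>
  return %v : vector<8x16xf16>
}
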
>From 7d54194f0c726db4461015de87abf9ad380bbfa3 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 3 Jun 2025 19:50:30 +0000
Subject: [PATCH 07/10] working version

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 159 +++++++++++++-----
 1 file changed, 120 insertions(+), 39 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index aa982ae779d1e..6b3ff8312e365 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -40,6 +40,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/InterleavedRange.h"
 #include "llvm/Support/LogicalResult.h"
@@ -905,59 +906,140 @@ void RunLayoutInfoPropagation::printAnalysisResult(llvm::raw_ostream &os) {
 // ///    be resolved using a layout conversion?
 // LogicalResult LayoutAttrAssignment::resolveConflicts() { return success(); }
 using GetLayoutCallbackFnTy = function_ref<xegpu::LayoutAttr(Value)>;
+static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
+                     GetLayoutCallbackFnTy getLayoutOfValue) {
+
+  // Iterate over all the results.
+  for (OpResult result : op->getResults()) {
+    Type resultType = result.getType();
+    // Layouts are needed only for vector and tensor descriptor types.
+    if (!isa<VectorType, xegpu::TensorDescType>(resultType))
+      continue;
+    // If the result has any users, we expect it to have a layout.
+    xegpu::LayoutAttr layout = getLayoutOfValue(result);
+    if (!layout && result.getNumUses() > 0) {
+      LLVM_DEBUG(DBGS() << "Expecting layout for result: " << result
+                        << " but got none.\n");
+      continue;
+    }
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+      // TODO: Handle error.
+      auto typeWithLayout = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+      result.setType(typeWithLayout);
+      continue;
+    }
+    // If the result is a vector type, add a temporary layout attribute to the
+    // op.
+    std::string resultLayoutName =
+        resultLayoutNamePrefix + std::to_string(result.getResultNumber());
+    op->setAttr(resultLayoutName, layout);
+    // Update all users of the result with the layout.
+    for (OpOperand &user : result.getUses()) {
+      Operation *owner = user.getOwner();
+      unsigned operandNumber = user.getOperandNumber();
+      // Add temporary layout attribute at the user op.
+      std::string attrName =
+          operandLayoutNamePrefix + std::to_string(operandNumber);
+      owner->setAttr(attrName, layout);
+    }
+  }
+}
 static void handleBranchTerminatorOpInterface(
     mlir::OpBuilder &builder,
     mlir::RegionBranchTerminatorOpInterface terminator,
     GetLayoutCallbackFnTy getLayoutOfValue) {}
 static void handleBranchOpInterface(mlir::OpBuilder &builder,
                                     mlir::RegionBranchOpInterface branch,
-                                    GetLayoutCallbackFnTy getLayoutOfValue) {}
-static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block,
-                             GetLayoutCallbackFnTy getLayoutOfValue) {}
-static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
-                     GetLayoutCallbackFnTy getLayoutOfValue) {
+                                    GetLayoutCallbackFnTy getLayoutOfValue) {
+  mlir::Operation *op = branch.getOperation();
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(op->getNumOperands(), nullptr);
+  branch.getEntrySuccessorRegions(operands, successors);
+  DenseMap<Value, xegpu::LayoutAttr> resultToLayouts;
+  mlir::ValueRange results = op->getResults();
+
+  for (mlir::RegionSuccessor &successor : successors) {
+    if (successor.isParent())
+      continue;
 
-  auto updateValue = [&](Value v, unsigned vIndex,
-                         const std::string &layoutAttrName) {
-    // Layouts are needed only for vector and tensor descriptor types.
-    if (!isa<VectorType, xegpu::TensorDescType>(v.getType()))
-      return;
-    xegpu::LayoutAttr layout = getLayoutOfValue(v);
+    mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor);
+    mlir::ValueRange blockArgs = successor.getSuccessorInputs();
+    unsigned index = 0;
+
+    for (auto [initArg, blockArg, result] :
+         llvm::zip(initArgs, blockArgs, results)) {
+      Type inputType = blockArg.getType();
+      if (!isa<xegpu::TensorDescType>(inputType))
+        continue;
+      xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(blockArg);
+      xegpu::LayoutAttr initArgLayout = getLayoutOfValue(initArg);
+
+      if (!blockArgLayout || !initArgLayout) {
+        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << blockArg
+                          << " or init arg: " << initArg << "\n");
+        continue;
+      }
+
+      // TODO: We expect these two to match. Data flow analysis will ensure
+      // this.
+      assert(blockArgLayout == initArgLayout &&
+             "Expecting block arg and init arg to have the same layout.");
+      // Get tensor descriptor type with the layout.
+      auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType);
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
+          tdescTy.getEncoding(), blockArgLayout);
+      blockArg.setType(newTdescTy);
+      // Store the layout for the result.
+      if (resultToLayouts.count(result) != 0 &&
+          resultToLayouts[result] != blockArgLayout) {
+        LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result
+                          << " - " << resultToLayouts[result] << " vs "
+                          << blockArgLayout << "\n");
+      } else {
+        resultToLayouts[result] = blockArgLayout;
+      }
+    }
+  }
+  for (auto [i, r] : llvm::enumerate(op->getResults())) {
+    Type resultType = r.getType();
+    if (!isa<xegpu::TensorDescType, VectorType>(resultType))
+      continue;
+    xegpu::LayoutAttr layout = getLayoutOfValue(r);
+    if (!layout)
+      layout = resultToLayouts[r];
     if (!layout) {
-      // TODO : handle error.
-      LLVM_DEBUG(DBGS() << "Expecting layout for value: " << v
-                        << " but got none.\n");
-      return;
+      LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: "
+                        << r << "\n");
+      continue;
     }
-    auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(v.getType());
-
-    if (tensorDescTy) {
-      auto newTensorDescTy = xegpu::TensorDescType::get(
+    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+      auto newTdescTy = xegpu::TensorDescType::get(
           tensorDescTy.getContext(), tensorDescTy.getShape(),
           tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
-      v.setType(newTensorDescTy);
-      return;
+      r.setType(newTdescTy);
+      continue;
     }
-    // If type is vector, add a temporary layout attribute to the op.
-    op->setAttr(layoutAttrName, layout);
-  };
-
-  // Iterate over all the operands.
-  for (OpOperand &operand : op->getOpOperands()) {
-    unsigned operandIndex = operand.getOperandNumber();
-    std::string operandLayoutName =
-        operandLayoutNamePrefix + std::to_string(operandIndex);
-    updateValue(operand.get(), operandIndex, operandLayoutName);
-  }
-
-  // Iterate over all the results.
-  for (OpResult result : op->getResults()) {
-    unsigned resultIndex = result.getResultNumber();
+    // If the result is a vector type, add a temporary layout attribute to the
+    // op.
     std::string resultLayoutName =
-        resultLayoutNamePrefix + std::to_string(resultIndex);
-    updateValue(result, resultIndex, resultLayoutName);
+        resultLayoutNamePrefix + std::to_string(r.getResultNumber());
+    op->setAttr(resultLayoutName, layout);
+    // Update all users of the result with the layout.
+    for (OpOperand &user : r.getUses()) {
+      Operation *owner = user.getOwner();
+      unsigned operandNumber = user.getOperandNumber();
+      // Add temporary layout attribute at the user op.
+      std::string attrName =
+          operandLayoutNamePrefix + std::to_string(operandNumber);
+      owner->setAttr(attrName, layout);
+    }
   }
 }
+static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block,
+                             GetLayoutCallbackFnTy getLayoutOfValue) {}
 
 namespace {
 
@@ -1722,7 +1804,6 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
   auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
     LayoutInfo layout = analyis.getLayoutInfo(val);
     if (!layout.isAssigned()) {
-      llvm::errs() << "No layout assigned for value" << val << "\n";
       return {};
     }
     SmallVector<int, 2> laneLayout, laneData;

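For handleBranchOpInterface, a hypothetical example of the intended end state (not from the patch; the concrete layout is a placeholder): the scf.for init arg, block argument, and loop result all carry the same layout-annotated tensor descriptor type.

// Illustrative only: the lane_layout/lane_data values are placeholders.
func.func @layout_through_scf_for(%src: memref<8x32xf16>) {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c4 = arith.constant 4 : index
  %td = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf16>
      -> !xegpu.tensor_desc<8x16xf16,
           #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
  %r = scf.for %i = %c0 to %c4 step %c1 iter_args(%arg = %td)
      -> (!xegpu.tensor_desc<8x16xf16,
            #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>) {
    %next = xegpu.update_nd_offset %arg, [0, 16]
        : !xegpu.tensor_desc<8x16xf16,
            #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
    scf.yield %next : !xegpu.tensor_desc<8x16xf16,
            #xegpu.layout<lane_layout = [1, 16], lane_data = [1, 1]>>
  }
  return
}
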
>From b289399e44bf56e91149cbfc37a729c14949c4d2 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 3 Jun 2025 21:36:11 +0000
Subject: [PATCH 08/10] working except for unreal cast

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 97 +++++++++----------
 1 file changed, 46 insertions(+), 51 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 6b3ff8312e365..dfb7b0668d2be 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -1291,11 +1291,14 @@ struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
     xegpu::TensorDescType distributedTensorDescTy =
         descOp.getType().dropLayouts(); // Distributed tensor descriptor type
                                         // does not contain layout info.
-    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+    Value newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
         newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
         descOp->getAttrs());
 
     Value distributedVal = newWarpOp.getResult(operandIdx);
+    // Resolve the distributed type to the expected type.
+    newDescOp =
+        resolveDistributedTy(newDescOp, distributedVal.getType(), rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newDescOp);
     return success();
   }
@@ -1697,10 +1700,13 @@ struct UpdateNdOffsetDistribution final : public gpu::WarpDistributionPattern {
       }
     }
     // Create a new update op outside the warp op.
-    auto newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
+    Value newUpdateOp = rewriter.create<xegpu::UpdateNdOffsetOp>(
         newWarpOp.getLoc(), newTensorDescTy, newUpdateOperands,
         removeTemporaryLayoutAttributes(updateOp->getAttrs()));
     Value distributedVal = newWarpOp.getResult(operandIdx);
+    // Resolve the distributed type with the original type.
+    newUpdateOp =
+        resolveDistributedTy(newUpdateOp, distributedVal.getType(), rewriter);
     rewriter.replaceAllUsesWith(distributedVal, newUpdateOp);
     return success();
   }
@@ -1836,55 +1842,44 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     updateBlockTypes(builder, *block, getXeGPULayoutForValue);
   });
 
-  // Assign xegpu::LayoutAttr to all ops and their users based on the layout
-  // propagation analysis result.
-  // LayoutAttrAssignment layoutAssignment(getOperation(), getPropagatedLayout);
-  // if (failed(layoutAssignment.run())) {
-  //   signalPassFailure();
-  //   return;
-  // }
-
   // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
   // operation.
-  // {
-  //   RewritePatternSet patterns(&getContext());
-  //   patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-
-  //   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-  //     signalPassFailure();
-  //     return;
-  //   }
-  //   // At this point, we have moved the entire function body inside the
-  //   warpOp.
-  //   // Now move any scalar uniform code outside of the warpOp (like GPU index
-  //   // ops, scalar constants, etc.). This will simplify the later lowering
-  //   and
-  //   // avoid custom patterns for these ops.
-  //   getOperation()->walk([&](Operation *op) {
-  //     if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
-  //       vector::moveScalarUniformCode(warpOp);
-  //     }
-  //   });
-  // }
-  // // Finally, do the SIMD to SIMT distribution.
-  // RewritePatternSet patterns(&getContext());
-  // xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  // // TODO: distributionFn and shuffleFn are not used at this point.
-  // auto distributionFn = [](Value val) {
-  //   VectorType vecType = dyn_cast<VectorType>(val.getType());
-  //   int64_t vecRank = vecType ? vecType.getRank() : 0;
-  //   OpBuilder builder(val.getContext());
-  //   if (vecRank == 0)
-  //     return AffineMap::get(val.getContext());
-  //   return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
-  // };
-  // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value
-  // srcIdx,
-  //                     int64_t warpSz) { return Value(); };
-  // vector::populatePropagateWarpVectorDistributionPatterns(
-  //     patterns, distributionFn, shuffleFn);
-  // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-  //   signalPassFailure();
-  //   return;
-  // }
+  {
+    RewritePatternSet patterns(&getContext());
+    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+
+    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+      signalPassFailure();
+      return;
+    }
+    // At this point, we have moved the entire function body inside the
+    // warpOp. Now move any scalar uniform code outside of the warpOp (like GPU
+    // index ops, scalar constants, etc.). This will simplify the later lowering
+    // and avoid custom patterns for these ops.
+    getOperation()->walk([&](Operation *op) {
+      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
+        vector::moveScalarUniformCode(warpOp);
+      }
+    });
+  }
+  // Finally, do the SIMD to SIMT distribution.
+  RewritePatternSet patterns(&getContext());
+  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+  // TODO: distributionFn and shuffleFn are not used at this point.
+  auto distributionFn = [](Value val) {
+    VectorType vecType = dyn_cast<VectorType>(val.getType());
+    int64_t vecRank = vecType ? vecType.getRank() : 0;
+    OpBuilder builder(val.getContext());
+    if (vecRank == 0)
+      return AffineMap::get(val.getContext());
+    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+  };
+  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
+                      int64_t warpSz) { return Value(); };
+  vector::populatePropagateWarpVectorDistributionPatterns(
+      patterns, distributionFn, shuffleFn);
+  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+    signalPassFailure();
+    return;
+  }
 }
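
As a rough illustration of the distributionFn defined above (the ranks here are
just examples), rank-0 values map to the empty affine map and ranked vectors map
to the full identity map over their dimensions:

    // Illustrative only; assumes an MLIRContext named `ctx` is available.
    AffineMap scalarMap = AffineMap::get(&ctx);       // rank 0: empty map
    AffineMap planeMap =
        AffineMap::getMultiDimIdentityMap(2, &ctx);   // rank 2: (d0, d1) -> (d0, d1)
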

>From 4318343ead59cda8f70741ca45e9255a6ce66bba Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Tue, 3 Jun 2025 22:57:01 +0000
Subject: [PATCH 09/10] some fixes

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 70 ++++++++++++++++---
 1 file changed, 60 insertions(+), 10 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index dfb7b0668d2be..56ec1eaa118e5 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -68,8 +68,14 @@ constexpr unsigned packedSizeInBitsForDefault =
     16; // Minimum packing size per register for DPAS A.
 constexpr unsigned packedSizeInBitsForDpasB =
     32; // Minimum packing size per register for DPAS B.
-static const char *const operandLayoutNamePrefix = "layout_operand_";
-static const char *const resultLayoutNamePrefix = "layout_result_";
+static const char *const operandLayoutNamePrefix =
+    "layout_operand_"; // Attribute name for identifying operand layouts.
+static const char *const resultLayoutNamePrefix =
+    "layout_result_"; // Attribute name for identifying result layouts.
+static const char *const resolveSIMTTypeMismatch =
+    "resolve_simt_type_mismatch"; // Attribute name for identifying
+                                  // UnrealizedConversionCastOps added to
+                                  // resolve SIMT type mismatches.
 
 namespace {
 
@@ -946,11 +952,11 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
     }
   }
 }
-static void handleBranchTerminatorOpInterface(
+static void updateBranchTerminatorOpInterface(
     mlir::OpBuilder &builder,
     mlir::RegionBranchTerminatorOpInterface terminator,
     GetLayoutCallbackFnTy getLayoutOfValue) {}
-static void handleBranchOpInterface(mlir::OpBuilder &builder,
+static void updateBranchOpInterface(mlir::OpBuilder &builder,
                                     mlir::RegionBranchOpInterface branch,
                                     GetLayoutCallbackFnTy getLayoutOfValue) {
   mlir::Operation *op = branch.getOperation();
@@ -966,7 +972,6 @@ static void handleBranchOpInterface(mlir::OpBuilder &builder,
 
     mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor);
     mlir::ValueRange blockArgs = successor.getSuccessorInputs();
-    unsigned index = 0;
 
     for (auto [initArg, blockArg, result] :
          llvm::zip(initArgs, blockArgs, results)) {
@@ -1117,6 +1122,7 @@ static Value resolveDistributedTy(Value orig, T expected,
   if (isa<xegpu::TensorDescType>(orig.getType())) {
     auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
                                                               expected, orig);
+    castOp->setAttr(resolveSIMTTypeMismatch, rewriter.getUnitAttr());
     return castOp.getResult(0);
   }
   llvm_unreachable("Unsupported type for reconciliation");
@@ -1804,9 +1810,7 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     analyis.printAnalysisResult(os);
     return;
   }
-  // auto getPropagatedLayout = [&](Value val) {
-  //   return analyis.getLayoutInfo(val);
-  // };
+
   auto getXeGPULayoutForValue = [&](Value val) -> xegpu::LayoutAttr {
     LayoutInfo layout = analyis.getLayoutInfo(val);
     if (!layout.isAssigned()) {
@@ -1827,13 +1831,13 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     for (mlir::Operation &op : llvm::reverse(block->getOperations())) {
       if (auto terminator =
               mlir::dyn_cast<mlir::RegionBranchTerminatorOpInterface>(op)) {
-        handleBranchTerminatorOpInterface(builder, terminator,
+        updateBranchTerminatorOpInterface(builder, terminator,
                                           getXeGPULayoutForValue);
         continue;
       }
 
       if (auto iface = mlir::dyn_cast<mlir::RegionBranchOpInterface>(op)) {
-        handleBranchOpInterface(builder, iface, getXeGPULayoutForValue);
+        updateBranchOpInterface(builder, iface, getXeGPULayoutForValue);
         continue;
       }
       updateOp(builder, &op, getXeGPULayoutForValue);
@@ -1882,4 +1886,50 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     signalPassFailure();
     return;
   }
+
+  // Clean up UnrealizedConversionCastOps that were inserted due to tensor
+  // descriptor type mismatches created by the upstream distribution patterns
+  // (e.g. scf.for distribution).
+  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
+    // We are only interested in UnrealizedConversionCastOps that were added
+    // for resolving SIMT type mismatches.
+    if (!op->getAttr(resolveSIMTTypeMismatch))
+      return WalkResult::skip();
+
+    Value input = op.getOperand(0);
+    Value output = op.getResult(0);
+
+    // Both input and output must have tensor descriptor types.
+    xegpu::TensorDescType inputDescType =
+        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
+    xegpu::TensorDescType outputDescType =
+        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
+    assert(inputDescType && outputDescType &&
+           "Unrealized conversion cast must have tensor descriptor types");
+
+    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
+    // This occurs inside scf.for body to resolve the block argument type to SIMT
+    // type.
+    if (inputDescType.getLayout()) {
+      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
+      if (argument) {
+        argument.setType(output.getType());
+        output.replaceAllUsesWith(argument);
+        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
+                argument.getOwner()->getParentOp())) {
+          auto result = loopOp.getTiedLoopResult(argument);
+          result.setType(output.getType());
+        }
+      }
+    }
+
+    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
+    // conversions. This occurs at the yield op of scf.for body to go back from
+    // SIMT type to original type.
+    if (outputDescType.getLayout())
+      output.replaceAllUsesWith(input);
+
+    if (op->use_empty())
+      op->erase();
+    return WalkResult::advance();
+  });
 }
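
To make the two cases above concrete, a hand-written sketch (pseudo-IR in
comments; the shapes, value names, and #layout are invented for illustration) of
the casts this walk expects and how each is resolved:

    // (1) Inside the scf.for body: the iter-arg block argument still carries a
    //     layout, so the cast drops it. The walk retypes the block argument
    //     (and the tied loop result) to the SIMT type and forwards all uses.
    //   %simt = unrealized_conversion_cast %iter_arg {resolve_simt_type_mismatch}
    //       : !xegpu.tensor_desc<8x16xf16, #layout> to !xegpu.tensor_desc<8x16xf16>
    //
    // (2) At the scf.yield of the body: the yielded value has no layout, so the
    //     cast adds it back. The walk simply forwards the cast's input to the
    //     cast's users.
    //   %orig = unrealized_conversion_cast %body_val {resolve_simt_type_mismatch}
    //       : !xegpu.tensor_desc<8x16xf16> to !xegpu.tensor_desc<8x16xf16, #layout>
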

>From 20a641545534132b59c934d7bc31b6c088134605 Mon Sep 17 00:00:00 2001
From: Charitha Saumya <charitha.saumya.gusthinna.waduge at intel.com>
Date: Wed, 4 Jun 2025 00:01:15 +0000
Subject: [PATCH 10/10] branch terminator iface

---
 .../Transforms/XeGPUSubgroupDistribute.cpp    | 332 ++++++++++--------
 1 file changed, 193 insertions(+), 139 deletions(-)

diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
index 56ec1eaa118e5..27d912b87c6dc 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp
@@ -955,7 +955,54 @@ static void updateOp(mlir::OpBuilder &builder, mlir::Operation *op,
 static void updateBranchTerminatorOpInterface(
     mlir::OpBuilder &builder,
     mlir::RegionBranchTerminatorOpInterface terminator,
-    GetLayoutCallbackFnTy getLayoutOfValue) {}
+    GetLayoutCallbackFnTy getLayoutOfValue) {
+  if (!mlir::isa<mlir::RegionBranchOpInterface>(terminator->getParentOp()))
+    return;
+
+  llvm::SmallVector<mlir::RegionSuccessor> successors;
+  llvm::SmallVector<mlir::Attribute> operands(terminator->getNumOperands(),
+                                              nullptr);
+  terminator.getSuccessorRegions(operands, successors);
+
+  for (mlir::RegionSuccessor &successor : successors) {
+    if (!successor.isParent())
+      continue;
+
+    mlir::OperandRange operands = terminator.getSuccessorOperands(successor);
+    mlir::ValueRange inputs = successor.getSuccessorInputs();
+    for (auto [operand, input] : llvm::zip(operands, inputs)) {
+      Type inputType = input.getType();
+      if (!isa<xegpu::TensorDescType>(inputType))
+        continue;
+      xegpu::LayoutAttr inputLayout = getLayoutOfValue(input);
+      xegpu::LayoutAttr operandLayout = getLayoutOfValue(operand);
+
+      if (!operandLayout) {
+        LLVM_DEBUG(DBGS() << "Expecting layout for region successor operand : "
+                          << operand << " but got none.\n");
+        continue;
+      }
+
+      if (inputLayout && inputLayout != operandLayout) {
+        LLVM_DEBUG(
+            DBGS()
+            << "Conflicting layouts for region successor operand and input: "
+            << inputLayout << " vs " << operandLayout << "\n");
+        continue;
+      }
+      llvm::errs() << "Setting layout for input to "
+                   << ": " << operandLayout << "\n";
+      // Get tensor descriptor type with the layout.
+      auto tdescTy = dyn_cast<xegpu::TensorDescType>(inputType);
+      auto newTdescTy = xegpu::TensorDescType::get(
+          tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
+          tdescTy.getEncoding(), operandLayout);
+      input.setType(newTdescTy);
+    }
+  }
+}
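
Both this terminator hook and updateBranchOpInterface below rebuild the same
tensor descriptor type with a layout attached; factored out purely for
illustration (this helper does not exist in the patch), the type rewrite is:

    // Illustrative helper only, mirroring the newTdescTy construction above.
    static xegpu::TensorDescType attachLayout(xegpu::TensorDescType tdescTy,
                                              xegpu::LayoutAttr layout) {
      return xegpu::TensorDescType::get(tdescTy.getContext(), tdescTy.getShape(),
                                        tdescTy.getElementType(),
                                        tdescTy.getEncoding(), layout);
    }
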
 static void updateBranchOpInterface(mlir::OpBuilder &builder,
                                     mlir::RegionBranchOpInterface branch,
                                     GetLayoutCallbackFnTy getLayoutOfValue) {
@@ -970,20 +1017,19 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder,
     if (successor.isParent())
       continue;
 
-    mlir::OperandRange initArgs = branch.getEntrySuccessorOperands(successor);
-    mlir::ValueRange blockArgs = successor.getSuccessorInputs();
+    mlir::OperandRange operands = branch.getEntrySuccessorOperands(successor);
+    mlir::ValueRange inputs = successor.getSuccessorInputs();
 
-    for (auto [initArg, blockArg, result] :
-         llvm::zip(initArgs, blockArgs, results)) {
-      Type inputType = blockArg.getType();
+    for (auto [operand, input, result] : llvm::zip(operands, inputs, results)) {
+      Type inputType = input.getType();
       if (!isa<xegpu::TensorDescType>(inputType))
         continue;
-      xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(blockArg);
-      xegpu::LayoutAttr initArgLayout = getLayoutOfValue(initArg);
+      xegpu::LayoutAttr blockArgLayout = getLayoutOfValue(input);
+      xegpu::LayoutAttr initArgLayout = getLayoutOfValue(operand);
 
       if (!blockArgLayout || !initArgLayout) {
-        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << blockArg
-                          << " or init arg: " << initArg << "\n");
+        LLVM_DEBUG(DBGS() << "No layout assigned for block arg: " << input
+                          << " or init arg: " << operand << "\n");
         continue;
       }
 
@@ -996,52 +1042,54 @@ static void updateBranchOpInterface(mlir::OpBuilder &builder,
       auto newTdescTy = xegpu::TensorDescType::get(
           tdescTy.getContext(), tdescTy.getShape(), tdescTy.getElementType(),
           tdescTy.getEncoding(), blockArgLayout);
-      blockArg.setType(newTdescTy);
+      input.setType(newTdescTy);
       // Store the layout for the result.
-      if (resultToLayouts.count(result) != 0 &&
-          resultToLayouts[result] != blockArgLayout) {
-        LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result
-                          << " - " << resultToLayouts[result] << " vs "
-                          << blockArgLayout << "\n");
-      } else {
-        resultToLayouts[result] = blockArgLayout;
-      }
-    }
-  }
-  for (auto [i, r] : llvm::enumerate(op->getResults())) {
-    Type resultType = r.getType();
-    if (!isa<xegpu::TensorDescType, VectorType>(resultType))
-      continue;
-    xegpu::LayoutAttr layout = getLayoutOfValue(r);
-    if (!layout)
-      layout = resultToLayouts[r];
-    if (!layout) {
-      LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result: "
-                        << r << "\n");
-      continue;
-    }
-    if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
-      auto newTdescTy = xegpu::TensorDescType::get(
-          tensorDescTy.getContext(), tensorDescTy.getShape(),
-          tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
-      r.setType(newTdescTy);
-      continue;
-    }
-    // If the result is a vector type, add a temporary layout attribute to the
-    // op.
-    std::string resultLayoutName =
-        resultLayoutNamePrefix + std::to_string(r.getResultNumber());
-    op->setAttr(resultLayoutName, layout);
-    // Update all users of the result with the layout.
-    for (OpOperand &user : r.getUses()) {
-      Operation *owner = user.getOwner();
-      unsigned operandNumber = user.getOperandNumber();
-      // Add temporary layout attribute at the user op.
-      std::string attrName =
-          operandLayoutNamePrefix + std::to_string(operandNumber);
-      owner->setAttr(attrName, layout);
+      // if (resultToLayouts.count(result) != 0 &&
+      //     resultToLayouts[result] != blockArgLayout) {
+      //   LLVM_DEBUG(DBGS() << "Conflicting layouts for result: " << result
+      //                     << " - " << resultToLayouts[result] << " vs "
+      //                     << blockArgLayout << "\n");
+      // } else {
+      //   resultToLayouts[result] = blockArgLayout;
+      // }
     }
   }
+  // for (auto [i, r] : llvm::enumerate(op->getResults())) {
+  //   Type resultType = r.getType();
+  //   if (!isa<xegpu::TensorDescType, VectorType>(resultType))
+  //     continue;
+  //   xegpu::LayoutAttr layout = getLayoutOfValue(r);
+  //   if (!layout)
+  //     layout = resultToLayouts[r];
+  //   if (!layout) {
+  //     LLVM_DEBUG(DBGS() << "No layout assigned for vector/tensor desc result:
+  //     "
+  //                       << r << "\n");
+  //     continue;
+  //   }
+  //   if (auto tensorDescTy = dyn_cast<xegpu::TensorDescType>(resultType)) {
+  //     auto newTdescTy = xegpu::TensorDescType::get(
+  //         tensorDescTy.getContext(), tensorDescTy.getShape(),
+  //         tensorDescTy.getElementType(), tensorDescTy.getEncoding(), layout);
+  //     r.setType(newTdescTy);
+  //     continue;
+  //   }
+  //   // If the result is a vector type, add a temporary layout attribute to
+  //   the
+  //   // op.
+  //   std::string resultLayoutName =
+  //       resultLayoutNamePrefix + std::to_string(r.getResultNumber());
+  //   op->setAttr(resultLayoutName, layout);
+  //   // Update all users of the result with the layout.
+  //   for (OpOperand &user : r.getUses()) {
+  //     Operation *owner = user.getOwner();
+  //     unsigned operandNumber = user.getOperandNumber();
+  //     // Add temporary layout attribute at the user op.
+  //     std::string attrName =
+  //         operandLayoutNamePrefix + std::to_string(operandNumber);
+  //     owner->setAttr(attrName, layout);
+  //   }
+  // }
 }
 static void updateBlockTypes(mlir::OpBuilder &builder, mlir::Block &block,
                              GetLayoutCallbackFnTy getLayoutOfValue) {}
@@ -1846,90 +1894,96 @@ void XeGPUSubgroupDistributePass::runOnOperation() {
     updateBlockTypes(builder, *block, getXeGPULayoutForValue);
   });
 
-  // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
-  // operation.
-  {
-    RewritePatternSet patterns(&getContext());
-    patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
-
-    if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-      signalPassFailure();
-      return;
-    }
-    // At this point, we have moved the entire function body inside the
-    // warpOp. Now move any scalar uniform code outside of the warpOp (like GPU
-    // index ops, scalar constants, etc.). This will simplify the later lowering
-    // and avoid custom patterns for these ops.
-    getOperation()->walk([&](Operation *op) {
-      if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
-        vector::moveScalarUniformCode(warpOp);
-      }
-    });
-  }
-  // Finally, do the SIMD to SIMT distribution.
-  RewritePatternSet patterns(&getContext());
-  xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
-  // TODO: distributionFn and shuffleFn are not used at this point.
-  auto distributionFn = [](Value val) {
-    VectorType vecType = dyn_cast<VectorType>(val.getType());
-    int64_t vecRank = vecType ? vecType.getRank() : 0;
-    OpBuilder builder(val.getContext());
-    if (vecRank == 0)
-      return AffineMap::get(val.getContext());
-    return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
-  };
-  auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value srcIdx,
-                      int64_t warpSz) { return Value(); };
-  vector::populatePropagateWarpVectorDistributionPatterns(
-      patterns, distributionFn, shuffleFn);
-  if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
-    signalPassFailure();
-    return;
-  }
-
-  // Clean up UnrealizedConversionCastOps that were inserted due to tensor desc
-  // type mismatches created by using upstream distribution patterns (scf.for)
-  getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
-    // We are only interested in UnrealizedConversionCastOps that were added
-    // for resolving SIMT type mismatches.
-    if (!op->getAttr(resolveSIMTTypeMismatch))
-      return WalkResult::skip();
-
-    Value input = op.getOperand(0);
-    Value output = op.getResult(0);
-
-    // Both input and output must have tensor descriptor types.
-    xegpu::TensorDescType inputDescType =
-        mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
-    xegpu::TensorDescType outputDescType =
-        mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
-    assert(inputDescType && outputDescType &&
-           "Unrealized conversion cast must have tensor descriptor types");
-
-    // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
-    // This occurs inside scf.for body to resolve the block argument type to SIMT
-    // type.
-    if (inputDescType.getLayout()) {
-      auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
-      if (argument) {
-        argument.setType(output.getType());
-        output.replaceAllUsesWith(argument);
-        if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
-                argument.getOwner()->getParentOp())) {
-          auto result = loopOp.getTiedLoopResult(argument);
-          result.setType(output.getType());
-        }
-      }
-    }
-
-    // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
-    // conversions. This occurs at the yield op of scf.for body to go back from
-    // SIMT type to original type.
-    if (outputDescType.getLayout())
-      output.replaceAllUsesWith(input);
-
-    if (op->use_empty())
-      op->erase();
-    return WalkResult::advance();
-  });
+  // // Move all operations of a GPU function inside gpu.warp_execute_on_lane_0
+  // // operation.
+  // {
+  //   RewritePatternSet patterns(&getContext());
+  //   patterns.add<MoveFuncBodyToWarpExecuteOnLane0>(&getContext());
+
+  //   if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  //     signalPassFailure();
+  //     return;
+  //   }
+  //   // At this point, we have moved the entire function body inside the
+  //   // warpOp. Now move any scalar uniform code outside of the warpOp (like
+  //   GPU
+  //   // index ops, scalar constants, etc.). This will simplify the later
+  //   lowering
+  //   // and avoid custom patterns for these ops.
+  //   getOperation()->walk([&](Operation *op) {
+  //     if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
+  //       vector::moveScalarUniformCode(warpOp);
+  //     }
+  //   });
+  // }
+  // // Finally, do the SIMD to SIMT distribution.
+  // RewritePatternSet patterns(&getContext());
+  // xegpu::populateXeGPUSubgroupDistributePatterns(patterns);
+  // // TODO: distributionFn and shuffleFn are not used at this point.
+  // auto distributionFn = [](Value val) {
+  //   VectorType vecType = dyn_cast<VectorType>(val.getType());
+  //   int64_t vecRank = vecType ? vecType.getRank() : 0;
+  //   OpBuilder builder(val.getContext());
+  //   if (vecRank == 0)
+  //     return AffineMap::get(val.getContext());
+  //   return AffineMap::getMultiDimIdentityMap(vecRank, val.getContext());
+  // };
+  // auto shuffleFn = [](Location loc, OpBuilder &builder, Value val, Value
+  // srcIdx,
+  //                     int64_t warpSz) { return Value(); };
+  // vector::populatePropagateWarpVectorDistributionPatterns(
+  //     patterns, distributionFn, shuffleFn);
+  // if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
+  //   signalPassFailure();
+  //   return;
+  // }
+
+  // // Clean up UnrealizedConversionCastOps that were inserted due to tensor
+  // desc
+  // // type mismatches created by using upstream distribution patterns
+  // (scf.for) getOperation()->walk([&](mlir::UnrealizedConversionCastOp op) {
+  //   // We are only interested in UnrealizedConversionCastOps that were added
+  //   // for resolving SIMT type mismatches.
+  //   if (!op->getAttr(resolveSIMTTypeMismatch))
+  //     return WalkResult::skip();
+
+  //   Value input = op.getOperand(0);
+  //   Value output = op.getResult(0);
+
+  //   // Both input and output must have tensor descriptor types.
+  //   xegpu::TensorDescType inputDescType =
+  //       mlir::dyn_cast<xegpu::TensorDescType>(input.getType());
+  //   xegpu::TensorDescType outputDescType =
+  //       mlir::dyn_cast<xegpu::TensorDescType>(output.getType());
+  //   assert(inputDescType && outputDescType &&
+  //          "Unrealized conversion cast must have tensor descriptor types");
+
+  //   // tensor_desc<shape, layout> -> tensor_desc<shape> Type of conversions.
+  //   // This occurs inside scf.for body to resolve the block argument type to
+  //   SIMT
+  //   // type.
+  //   if (inputDescType.getLayout()) {
+  //     auto argument = mlir::dyn_cast<mlir::BlockArgument>(input);
+  //     if (argument) {
+  //       argument.setType(output.getType());
+  //       output.replaceAllUsesWith(argument);
+  //       if (auto loopOp = mlir::dyn_cast<mlir::LoopLikeOpInterface>(
+  //               argument.getOwner()->getParentOp())) {
+  //         auto result = loopOp.getTiedLoopResult(argument);
+  //         result.setType(output.getType());
+  //       }
+  //     }
+  //   }
+
+  //   // tensor_desc<shape> -> tensor_desc<shape, layout> Type of
+  //   // conversions. This occurs at the yield op of scf.for body to go back
+  //   from
+  //   // SIMT type to original type.
+  //   if (outputDescType.getLayout())
+  //     output.replaceAllUsesWith(input);
+
+  //   if (op->use_empty())
+  //     op->erase();
+  //   return WalkResult::advance();
+  // });
 }


