[Mlir-commits] [mlir] [mlir][xegpu] SIMT distribution patterns for XeGPU CreateNdTdesc, LoadNd, StoreNd and Dpas Ops. (PR #135271)

Charitha Saumya llvmlistbot at llvm.org
Wed Apr 16 13:13:49 PDT 2025


================
@@ -634,6 +678,643 @@ void RunSGMapPropagation::printAnalysisResult(llvm::raw_ostream &os) {
   }
 }
 
+static void attachLayoutAttributeToUsers(Value v, xegpu::LayoutAttr layout) {
+  for (OpOperand &user : v.getUses()) {
+    Operation *owner = user.getOwner();
+    unsigned operandNumber = user.getOperandNumber();
+    /// Use a generic name for ease of querying the layout attribute later.
+    std::string attrName =
+        operandLayoutNamePrefix + std::to_string(operandNumber);
+    owner->setAttr(attrName, layout);
+  }
+}
+
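+/// Walk the operation and attach the layout information computed by the
+/// propagation analysis. For ops producing tensor descriptors the layout
+/// becomes part of the result type; for other results it is attached as
+/// temporary result/operand attributes that are consumed later by the
+/// distribution patterns.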
+static LogicalResult attachLayoutAttributes(
+    Operation *top, llvm::function_ref<LayoutInfo(Value)> getPropagatedLayout) {
+  /// Helper to convert the layout info to the xegpu::LayoutAttr.
+  auto getLayoutInfoForResult = [&](Value r) -> xegpu::LayoutAttr {
+    auto layout = getPropagatedLayout(r);
+    if (!layout.isAssigned())
+      return {};
+    SmallVector<int, 2> laneLayout, laneData;
+    for (auto [layoutDim, dataDim] : llvm::zip_equal(
+             layout.getLayoutAsArrayRef(), layout.getDataAsArrayRef())) {
+      laneLayout.push_back(static_cast<int>(layoutDim));
+      laneData.push_back(static_cast<int>(dataDim));
+    }
+    return xegpu::LayoutAttr::get(r.getContext(), laneLayout, laneData);
+  };
+  /// Attach the layout attributes to the results of the operations.
+  auto walkResult = top->walk([&](Operation *op) {
+    /// For function ops, propagate the argument layout to the users.
+    if (auto func = dyn_cast<FunctionOpInterface>(op)) {
+      for (auto arg : func.getArguments()) {
+        auto layoutInfo = getLayoutInfoForResult(arg);
+        if (layoutInfo) {
+          attachLayoutAttributeToUsers(arg, layoutInfo);
+        }
+      }
+      return WalkResult::advance();
+    }
+    /// If no results, move on.
+    if (op->getNumResults() == 0)
+      return WalkResult::advance();
+    /// If all the results are scalars, move on.
+    if (llvm::all_of(op->getResultTypes(),
+                     [](Type t) { return t.isIntOrIndexOrFloat(); }))
+      return WalkResult::advance();
+
+    if (auto tensorDescTy =
+            dyn_cast<xegpu::TensorDescType>(op->getResult(0).getType())) {
+      auto layoutInfo = getLayoutInfoForResult(op->getResult(0));
+      if (!layoutInfo) {
+        LLVM_DEBUG(DBGS() << "No layout for result of " << *op << "\n");
+        return WalkResult::interrupt();
+      }
+
+      /// Clone the op, attach the layout to the result tensor descriptor, and
+      /// remove the original op.
+      OpBuilder builder(op);
+      auto *newOp = builder.clone(*op);
+      auto newTensorDescTy = xegpu::TensorDescType::get(
+          tensorDescTy.getContext(), tensorDescTy.getShape(),
+          tensorDescTy.getElementType(), tensorDescTy.getEncoding(),
+          layoutInfo);
+      newOp->getResult(0).setType(newTensorDescTy);
+      op->replaceAllUsesWith(newOp->getResults());
+      op->erase();
+      return WalkResult::advance();
+    }
+    /// Otherwise simply attach the layout to the op itself.
+    for (auto [i, r] : llvm::enumerate(op->getResults())) {
+      auto layoutInfo = getLayoutInfoForResult(r);
+      if (layoutInfo) {
+        auto attrName = resultLayoutNamePrefix + std::to_string(i);
+        op->setAttr(attrName, layoutInfo);
+        /// Attach the layout attribute to the users of the result.
+        attachLayoutAttributeToUsers(r, layoutInfo);
+      }
+    }
+    return WalkResult::advance();
+  });
+
+  return failure(walkResult.wasInterrupted());
+}
+
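+/// Resolve layout conflicts between producers and consumers of values.
+/// Currently a no-op; see the TODO below.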
+static LogicalResult resolveLayoutConflicts(Operation *top) {
+  /// TODO: Implement the layout conflict resolution.
+  return success();
+}
+
+namespace {
+
+//===----------------------------------------------------------------------===//
+// SIMT Distribution Patterns
+//===----------------------------------------------------------------------===//
+
+/// Returns the distributed vector type for a source vector type according to
+/// the lane_layout. We simply divide each dimension of the vector shape by the
+/// corresponding lane_layout dimension. If array_length > 1, it is kept at the
+/// front of the distributed shape (leading dimensions without a matching
+/// lane_layout dimension are not distributed).
+///
+/// Examples:
+/// | original vector shape | lane_layout | distributed vector shape |
+/// |-----------------------|-------------|--------------------------|
+/// | 32x16                 | [1, 16]     | 32x1                     |
+/// | 32x16                 | [2, 8]      | 16x2                     |
+/// | 2x32x16               | [1, 16]     | 2x32x1                   |
+FailureOr<VectorType> getDistVecTypeBasedOnLaneLayout(xegpu::LayoutAttr layout,
+                                                      VectorType originalType) {
+  if (!layout)
+    return failure();
+
+  auto laneLayout = layout.getLaneLayout().asArrayRef();
+  assert(originalType.getShape().size() >= laneLayout.size() &&
+         "Rank of the original vector type should be greater or equal to the "
+         "size of the lane layout to distribute the vector type.");
+  SmallVector<int64_t> distributedShape(originalType.getShape());
+  /// Only distribute the last `laneLayout.size()` dimensions. The remaining
+  /// dimensions are not distributed.
+  unsigned distributionStart = originalType.getRank() - laneLayout.size();
+  for (auto [i, dim] : llvm::enumerate(originalType.getShape())) {
+    if (i < distributionStart) {
+      continue;
+    }
+    /// Check if the dimension can be distributed evenly.
+    if (dim % laneLayout[i - distributionStart] != 0)
+      return failure();
+    distributedShape[i] = dim / laneLayout[i - distributionStart];
+  }
+  return VectorType::get(distributedShape, originalType.getElementType());
+}
+
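+/// Compute the xegpu-native distributed vector type for the given vector type
+/// and layout by wrapping the type in a temporary tensor descriptor and
+/// querying its distributed vector type.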
+static VectorType getDistributedVectorType(xegpu::LayoutAttr layout,
+                                           VectorType originalType) {
+  auto shape = originalType.getShape();
+  auto distVecTyOrFailure =
+      xegpu::TensorDescType::get(shape, originalType.getElementType(),
+                                 /*array_length=*/1, /*boundary_check=*/true,
+                                 /*memory_space=*/xegpu::MemorySpace::Global,
+                                 layout)
+          .getDistributedVectorType();
+  assert(llvm::succeeded(distVecTyOrFailure) &&
+         "Failed to compute distributed vector type for the given vector type");
+  return distVecTyOrFailure.value();
+}
+
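+/// Helper to create a copy of the given tensor descriptor type with the
+/// layout attribute dropped. Distributed tensor descriptors carry no layout
+/// information.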
+static xegpu::TensorDescType dropLayouts(xegpu::TensorDescType tensorDesc) {
+  return xegpu::TensorDescType::get(
+      tensorDesc.getContext(), tensorDesc.getShape(),
+      tensorDesc.getElementType(), tensorDesc.getEncoding(),
+      xegpu::LayoutAttr());
+}
+
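+/// Helper to reconcile the type of `orig` with the `expected` distributed
+/// type. If the types already match, `orig` is returned unchanged; otherwise
+/// a vector.shape_cast (for vectors) or an unrealized_conversion_cast (for
+/// tensor descriptors) is inserted.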
+template <typename T>
+static Value resolveDistributedTy(Value orig, T expected,
+                                  PatternRewriter &rewriter) {
+  /// If orig and expected types are the same, return orig.
+  if (orig.getType() == expected)
+    return orig;
+  /// If orig is a vector type, create a shape cast op to reconcile the types.
+  if (isa<VectorType>(orig.getType())) {
+    auto castOp =
+        rewriter.create<vector::ShapeCastOp>(orig.getLoc(), expected, orig);
+    return castOp.getResult();
+  }
+  /// If orig is a tensor descriptor type, create an unrealized conversion cast
+  /// op to reconcile the types.
+  if (isa<xegpu::TensorDescType>(orig.getType())) {
+    auto castOp = rewriter.create<UnrealizedConversionCastOp>(orig.getLoc(),
+                                                              expected, orig);
+    return castOp.getResult(0);
+  }
+  llvm_unreachable("Unsupported type for reconciliation");
+  return orig;
+}
+
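+/// Helper to filter out the temporary layout attributes attached during the
+/// layout propagation step. These attributes must not appear on the final
+/// distributed operations.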
+static SmallVector<NamedAttribute>
+filterTemporaryLayoutAttributes(ArrayRef<NamedAttribute> attrs) {
+  SmallVector<NamedAttribute> newAttrs;
+  for (auto attr : attrs) {
+    if (attr.getName().strref().contains(operandLayoutNamePrefix) ||
+        attr.getName().strref().contains(resultLayoutNamePrefix)) {
+      continue;
+    }
+    newAttrs.push_back(attr);
+  }
+  return newAttrs;
+}
+
+/// Given a GPUFuncOp, this pattern creates a new GPUFuncOp and moves the body
+/// of the original GPUFuncOp to the new GPUFuncOp such that the entire body is
+/// contained within a WarpExecuteOnLane0Op.
+/// Example:
+///
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     ...
+///     ...
+///     gpu.return %result: vector<8x16xf32>
+///   }
+/// ```
+/// To
+/// ```
+///   gpu.func @foo(%arg0: memref<*xf16>) -> vector<8x16xf32> {
+///     %laneid = gpu.lane_id : index
+///     %0 = gpu.warp_execute_on_lane_0(%laneid) -> vector<8x16xf32> {
+///       ...
+///       ...
+///       gpu.yield %result: vector<8x16xf32>
+///     }
+///     gpu.return %0
+///   }
+/// ```
+struct MoveFuncBodyToWarpExecuteOnLane0
+    : public OpRewritePattern<gpu::GPUFuncOp> {
+  using OpRewritePattern<gpu::GPUFuncOp>::OpRewritePattern;
+  LogicalResult matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+                                PatternRewriter &rewriter) const override {
+    /// If the function only contains a single void return, skip.
+    if (llvm::all_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::ReturnOp>(op) && !op.getNumOperands();
+        }))
+      return failure();
+    /// If the body was already moved inside a warp_execute_on_lane_0, skip.
+    if (llvm::any_of(gpuFuncOp.getBody().getOps(), [](Operation &op) {
+          return isa<gpu::WarpExecuteOnLane0Op>(op);
+        }))
+      return failure();
+    /// Create a new function with the same signature.
+    auto newGpuFunc = rewriter.create<gpu::GPUFuncOp>(
+        gpuFuncOp.getLoc(), gpuFuncOp.getName(), gpuFuncOp.getFunctionType());
+    /// Create a WarpExecuteOnLane0Op with same arguments and results as the
+    /// original gpuFuncOp.
+    rewriter.setInsertionPointToEnd(&newGpuFunc.getFunctionBody().front());
+    auto laneId = rewriter.create<gpu::LaneIdOp>(
+        newGpuFunc.getLoc(), rewriter.getIndexType(),
+        /*upperBound=*/mlir::IntegerAttr());
+    auto gpuFuncResultType = gpuFuncOp.getFunctionType().getResults();
+    auto warpOp = rewriter.create<gpu::WarpExecuteOnLane0Op>(
+        laneId.getLoc(), gpuFuncResultType, laneId, subgroupSize,
+        newGpuFunc.getArguments(), newGpuFunc.getArgumentTypes());
+    auto &warpBodyBlock = warpOp.getBodyRegion().front();
+    /// Replace the ReturnOp of the original gpu function with a YieldOp.
+    auto origReturnOp =
+        cast<gpu::ReturnOp>(gpuFuncOp.getBlocks().back().getTerminator());
+    rewriter.setInsertionPointAfter(origReturnOp);
+    rewriter.create<gpu::YieldOp>(origReturnOp.getLoc(),
+                                  origReturnOp.getOperands());
+    rewriter.eraseOp(origReturnOp);
+    /// Move the original function body to the WarpExecuteOnLane0Op body.
+    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), warpOp.getBodyRegion(),
+                                warpOp.getBodyRegion().begin());
+    rewriter.eraseBlock(&warpBodyBlock);
+    /// Insert a new ReturnOp after the WarpExecuteOnLane0Op.
+    rewriter.setInsertionPointAfter(warpOp);
+    rewriter.create<gpu::ReturnOp>(newGpuFunc.getLoc(), warpOp.getResults());
+    rewriter.replaceOp(gpuFuncOp, newGpuFunc);
+    return success();
+  }
+};
+
+/// Clone a create_nd_tdesc feeding into the yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op. The warp op
+/// will still contain the original op that will not be used by the yield op
+/// (and should be cleaned up later with dce). The yield op will bypass the
+/// create_nd_tdesc's arguments. The tensor descriptor is not distributed
+/// because it is a uniform value across all work items within the subgroup.
+///
+/// Example:
+///
+/// ```
+///   #layout0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (!xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %td = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     gpu.yield %td
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (memref<4x8xf32>, !xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %dead = xegpu.create_nd_tdesc %arg0[0, 0]
+///               : memref<4x8xf32> -> !xegpu.tensor_desc<4x8xf32>
+///     gpu.yield %arg0, %dead
+///   }
+///   %td = xegpu.create_nd_tdesc %r#0[0, 0]: memref<4x8xf32>
+///                                 -> !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct CreateNdDescDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::CreateNdDescOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::CreateNdDesc op");
+    auto descOp = operand->get().getDefiningOp<xegpu::CreateNdDescOp>();
+    unsigned operandIdx = operand->getOperandNumber();
+
+    auto srcTypedVal = dyn_cast<TypedValue<MemRefType>>(descOp.getSource());
+    if (!srcTypedVal)
+      return rewriter.notifyMatchFailure(
+          descOp, "expecting a memref typed value as the source");
+
+    auto descOffsets = descOp.getMixedOffsets();
+
+    xegpu::LayoutAttr layout = descOp.getType().getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          descOp, "the tensor descriptor lacks sg_map attribute");
+
+    SmallVector<size_t> newRetIndices;
+    SmallVector<Value> newYieldValues;
+    SmallVector<Type> newYieldTypes;
+
+    for (auto arg : descOp->getOperands()) {
+      newYieldValues.push_back(arg);
+      newYieldTypes.push_back(arg.getType());
+    }
+    rewriter.setInsertionPoint(subgroupOp);
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp, /* new yielded values = */ newYieldValues,
+        /* new yielded types = */ newYieldTypes, newRetIndices);
+
+    SmallVector<Value> newDescOperands;
+    for (auto i : newRetIndices) {
+      newDescOperands.push_back(newWarpOp.getResult(i));
+    }
+    rewriter.setInsertionPointAfter(newWarpOp);
+    auto distributedTensorDescTy =
+        dropLayouts(descOp.getType()); /// Distributed tensor descriptor type
+                                       /// does not contain layout info.
+    auto newDescOp = rewriter.create<xegpu::CreateNdDescOp>(
+        newWarpOp.getLoc(), distributedTensorDescTy, newDescOperands,
+        descOp->getAttrs());
+
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    rewriter.replaceAllUsesWith(distributedVal, newDescOp);
+    return success();
+  }
+};
+
+/// Sink a store_nd op found at the end of the enclosing
+/// `gpu.warp_execute_on_lane_0` region out of the warp op. Arguments of the
+/// store are passed through the warp op interface and propagated as its
+/// returned values. Only the source vector of the store is distributed
+/// according to the layout attribute; the tensor descriptor is not
+/// distributed.
+///
+/// Example:
+///
+/// ```
+///   #layout0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>
+///   gpu.warp_execute_on_lane_0(%laneid) -> () {
+///     ...
+///     xegpu.store_nd %arg0, %arg1: vector<4x8xf32>,
+///                                 !xegpu.tensor_desc<4x8xf32>
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>, !xegpu.tensor_desc<4x8xf32>) {
+///     gpu.yield %arg0, %arg1: vector<4x8xf32>, !xegpu.tensor_desc<4x8xf32>
+///   }
+///   xegpu.store_nd %r#0, %r#1: vector<4x1xf32>,
+///     !xegpu.tensor_desc<4x8xf32>
+///
+/// ```
+struct StoreNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    auto yield = cast<gpu::YieldOp>(
+        subgroupOp.getBodyRegion().getBlocks().begin()->getTerminator());
+    Operation *lastNode = yield->getPrevNode();
+    auto storeOp = dyn_cast_or_null<xegpu::StoreNdOp>(lastNode);
+    if (!storeOp)
+      return failure();
+
+    auto tensorDescTy = storeOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          storeOp, "the source tensor descriptor lacks sg_map attribute");
+
+    auto distributedTypeByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, storeOp.getValueType());
+    if (failed(distributedTypeByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(storeOp,
+                                         "Failed to distribute the type");
+    VectorType distributedTypeByWarpOp =
+        distributedTypeByWarpOpOrFailure.value();
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */
+        ValueRange{storeOp.getValue(), storeOp.getTensorDesc()},
+        /* new yielded types = */
+        TypeRange{distributedTypeByWarpOp, storeOp.getTensorDescType()},
+        newRetIndices);
+    /// Create a new store op outside the warp op with the distributed vector
+    /// type. Tensor descriptor is not distributed.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    SmallVector<Value> newStoreOperands;
+
+    /// For the value operand, there can be a mismatch between the vector type
+    /// distributed by the warp op and (xegpu-specific) distributed type
+    /// supported by the store op. Type mismatch must be resolved using
+    /// appropriate cast op.
+    auto storeNdDistributedValueTyOrFailure =
+        storeOp.getTensorDescType().getDistributedVectorType();
+    if (failed(storeNdDistributedValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          storeOp, "Failed to get distributed vector type for the store op");
+    newStoreOperands.push_back(resolveDistributedTy(
+        newWarpOp.getResult(newRetIndices[0]),
+        storeNdDistributedValueTyOrFailure.value(), rewriter));
+    /// For the tensor descriptor operand, the layout attribute is dropped
+    /// after distribution. Types must be resolved in this case as well.
+    auto distributedTensorDescTy = dropLayouts(storeOp.getTensorDescType());
+    newStoreOperands.push_back(
+        resolveDistributedTy(newWarpOp.getResult(newRetIndices[1]),
+                             distributedTensorDescTy, rewriter));
+
+    rewriter.create<xegpu::StoreNdOp>(
+        newWarpOp.getLoc(), TypeRange{}, newStoreOperands,
+        filterTemporaryLayoutAttributes(storeOp->getAttrs()));
+    rewriter.eraseOp(storeOp);
+    return success();
+  }
+};
+
+/// Clone a load_nd feeding into the yield op of the enclosing
+/// `gpu.warp_execute_on_lane_0` and put it after the warp op.
+/// The warp op will still contain the original op that will not be used by
+/// the yield op (and should be cleaned up later with dce). The yield op will
+/// bypass the load's arguments. Only the loaded vector is distributed
+/// according to the layout attribute; the tensor descriptor type is not
+/// distributed.
+///
+/// Example:
+///
+/// ```
+///   #layout0 = #xegpu.layout<lane_layout = [1, 8], lane_data = [1, 1]>
+///   %r = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>) {
+///     ...
+///     %ld = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
+///       vector<4x8xf32>
+///     gpu.yield %ld
+///   }
+/// ```
+/// To
+/// ```
+///   %r:2 = gpu.warp_execute_on_lane_0(%laneid) ->
+///                   (vector<4x1xf32>, !xegpu.tensor_desc<4x8xf32>) {
+///     ...
+///     %dead = xegpu.load_nd %arg0: !xegpu.tensor_desc<4x8xf32> ->
+///       vector<4x8xf32>
+///     gpu.yield %dead, %arg0
+///   }
+///   %ld = xegpu.load_nd %r#1: !xegpu.tensor_desc<4x8xf32> -> vector<4x1xf32>
+///
+/// ```
+struct LoadNdDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op subgroupOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *operand =
+        getWarpResult(subgroupOp, llvm::IsaPred<xegpu::LoadNdOp>);
+    if (!operand)
+      return rewriter.notifyMatchFailure(
+          subgroupOp, "warp result is not a xegpu::LoadNd op");
+
+    auto loadOp = operand->get().getDefiningOp<xegpu::LoadNdOp>();
+    xegpu::TensorDescType tensorDescTy = loadOp.getTensorDescType();
+    xegpu::LayoutAttr layout = tensorDescTy.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          loadOp, "the source tensor descriptor lacks sg_map attribute");
+
+    unsigned operandIdx = operand->getOperandNumber();
+    VectorType distributedTypeByWarpOp =
+        cast<VectorType>(subgroupOp.getResult(operandIdx).getType());
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, subgroupOp,
+        /* new yielded values = */ loadOp.getTensorDesc(),
+        /* new yielded types = */ tensorDescTy, newRetIndices);
+
+    /// Create a new load op outside the warp op with the distributed vector
+    /// type.
+    rewriter.setInsertionPointAfter(newWarpOp);
+    auto loadNdDistValueTyOrFailure =
+        loadOp.getTensorDescType().getDistributedVectorType();
+    if (failed(loadNdDistValueTyOrFailure))
+      return rewriter.notifyMatchFailure(
+          loadOp, "Failed to get distributed vector type for the load op");
+    auto distributedTensorDescTy =
+        dropLayouts(loadOp.getTensorDescType()); /// Distributed tensor
+                                                 /// descriptor type does not
+                                                 /// contain layout info.
+    Value newLoadOp = rewriter.create<xegpu::LoadNdOp>(
+        newWarpOp.getLoc(), loadNdDistValueTyOrFailure.value(),
+        resolveDistributedTy(newWarpOp->getResult(newRetIndices[0]),
+                             distributedTensorDescTy, rewriter),
+        filterTemporaryLayoutAttributes(loadOp->getAttrs()));
+    Value distributedVal = newWarpOp.getResult(operandIdx);
+    /// There can be a conflict between the vector type distributed by the
+    /// warp op and (xegpu-specific) distributed type supported by the load
+    /// op. Resolve these mismatches by inserting a cast.
+    newLoadOp =
+        resolveDistributedTy(newLoadOp, distributedTypeByWarpOp, rewriter);
+    rewriter.replaceAllUsesWith(distributedVal, newLoadOp);
+    return success();
+  }
+};
+
+struct DpasDistribution final : public gpu::WarpDistributionPattern {
----------------
charithaintc wrote:

added all the missing comments and refactored code for clarity. 

https://github.com/llvm/llvm-project/pull/135271

