[Mlir-commits] [mlir] [MLIR] Move warp_execute_on_lane_0 from vector to gpu (PR #116994)
Petr Kurapov
llvmlistbot at llvm.org
Wed Nov 20 07:45:09 PST 2024
https://github.com/kurapov-peter created https://github.com/llvm/llvm-project/pull/116994
Please see the related RFC here: https://discourse.llvm.org/t/rfc-move-execute-on-lane-0-from-vector-to-gpu-dialect/82989.
This patch does exactly one thing: it moves the op from the vector dialect to the gpu dialect.
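For quick reference, below is a minimal sketch of what the rename looks like to users of the op. The semantics, custom syntax, and distribution rules are unchanged; only the dialect prefix and the terminator spelling change (the `%laneid` and `"some_def"` names are illustrative, mirroring the examples in the op documentation):

```mlir
// Before this patch: the op and its terminator are spelled in the vector dialect.
%r0 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
  // Runs on lane 0 only; the yielded vector<32xf32> is distributed
  // across the 32 lanes as vector<1xf32> per lane.
  %v = "some_def"() : () -> (vector<32xf32>)
  vector.yield %v : vector<32xf32>
}

// After this patch: the same op, now spelled in the gpu dialect with gpu.yield.
%r1 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
  %v = "some_def"() : () -> (vector<32xf32>)
  gpu.yield %v : vector<32xf32>
}
```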
From 5a26f6257ffe7f0c06c9dc9029ccbcfdd0671cd4 Mon Sep 17 00:00:00 2001
From: Petr Kurapov <petr.a.kurapov@intel.com>
Date: Wed, 20 Nov 2024 15:39:00 +0000
Subject: [PATCH] [MLIR] Move warp_execute_on_lane_0 from vector to gpu
---
mlir/include/mlir/Dialect/GPU/IR/GPUOps.td | 138 ++++++
.../mlir/Dialect/Vector/IR/VectorOps.td | 133 -----
.../Vector/Transforms/VectorDistribution.h | 17 +-
mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 182 +++++++
mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 182 -------
.../Vector/Transforms/VectorDistribute.cpp | 98 ++--
.../Conversion/GPUCommon/transfer_write.mlir | 2 +-
mlir/test/Dialect/GPU/invalid.mlir | 86 ++++
mlir/test/Dialect/GPU/ops.mlir | 36 ++
mlir/test/Dialect/Vector/invalid.mlir | 86 ----
mlir/test/Dialect/Vector/ops.mlir | 35 --
.../Vector/vector-warp-distribute.mlir | 456 +++++++++---------
.../GPU/CUDA/test-reduction-distribute.mlir | 2 +-
.../Vector/GPU/CUDA/test-warp-distribute.mlir | 2 +-
.../Dialect/Vector/TestVectorTransforms.cpp | 11 +-
15 files changed, 738 insertions(+), 728 deletions(-)
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
index 6098eb34d04d52..5b1d7bb87a219a 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1097,6 +1097,10 @@ def GPU_YieldOp : GPU_Op<"yield", [Pure, ReturnLike, Terminator]>,
```
}];
+ let builders = [
+ OpBuilder<(ins), [{ /* nothing to do */ }]>
+ ];
+
let assemblyFormat = "attr-dict ($values^ `:` type($values))?";
}
@@ -2921,4 +2925,138 @@ def GPU_SetCsrPointersOp : GPU_Op<"set_csr_pointers", [GPU_AsyncOpInterface]> {
}];
}
+def GPU_WarpExecuteOnLane0Op : GPU_Op<"warp_execute_on_lane_0",
+ [DeclareOpInterfaceMethods<RegionBranchOpInterface, ["areTypesCompatible"]>,
+ SingleBlockImplicitTerminator<"gpu::YieldOp">,
+ RecursiveMemoryEffects]> {
+  let summary = "Executes operations in the associated region on thread #0 of an"
+    " SPMD program";
+ let description = [{
+    `warp_execute_on_lane_0` is an operation used to bridge the gap between
+    vector programming and SPMD programming models like GPU SIMT. It allows
+    trivially converting a region of vector code meant to run on multiple
+    threads into a valid SPMD region, and then allows incrementally
+    transforming it to distribute the vector operations onto the threads.
+
+    Any code present in the region is only executed on the first thread/lane,
+    based on the `laneid` operand. The `laneid` operand is an integer ID in
+    the range [0, `warp_size`). The `warp_size` attribute indicates the number
+    of lanes in a warp.
+
+    Operands are vector values distributed on all lanes that may be used by
+    the single-lane execution. The matching region argument is a vector of all
+    the values of those lanes available to the single active lane. The
+    distributed dimension is implicit based on the shape of the operand and
+    argument. The properties of the distribution may be described by extra
+    attributes (e.g. an affine map).
+
+    Return values are distributed on all lanes using `laneid` as the index.
+    The vector is distributed based on the shape ratio between the vector type
+    of the yield and the result type.
+    If the shapes are the same, the value is broadcast to all lanes.
+ In the future the distribution can be made more explicit using affine_maps
+ and will support having multiple Ids.
+
+    Therefore the `warp_execute_on_lane_0` operation allows implicitly copying
+    between lane 0 and the lanes of the warp. When distributing a vector
+    from lane 0 to all the lanes, the data are distributed in a block-cyclic
+    way. For example, `vector<64xf32>` distributed on 32 threads maps to
+    `vector<2xf32>`, where thread 0 contains vector[0] and vector[1].
+
+    During lowering, values passed as operands and return values need to be
+ visible to different lanes within the warp. This would usually be done by
+ going through memory.
+
+    The region is *not* isolated from above. For values coming from the parent
+    region without going through operands, only the lane 0 value will be
+    accessible, so this generally only makes sense for uniform values.
+
+ Example:
+ ```
+ // Execute in parallel on all threads/lanes.
+ gpu.warp_execute_on_lane_0 (%laneid)[32] {
+ // Serial code running only on thread/lane 0.
+ ...
+ }
+ // Execute in parallel on all threads/lanes.
+ ```
+
+ This may be lowered to an scf.if region as below:
+ ```
+ // Execute in parallel on all threads/lanes.
+ %cnd = arith.cmpi eq, %laneid, %c0 : index
+ scf.if %cnd {
+ // Serial code running only on thread/lane 0.
+ ...
+ }
+ // Execute in parallel on all threads/lanes.
+ ```
+
+ When the region has operands and/or return values:
+ ```
+ // Execute in parallel on all threads/lanes.
+ %0 = gpu.warp_execute_on_lane_0(%laneid)[32]
+           args(%v0 : vector<4xf32>) -> (vector<1xf32>) {
+      ^bb0(%arg0 : vector<128xf32>) :
+ // Serial code running only on thread/lane 0.
+ ...
+ gpu.yield %1 : vector<32xf32>
+ }
+ // Execute in parallel on all threads/lanes.
+ ```
+
+    Values at the region boundary would go through memory:
+ ```
+ // Execute in parallel on all threads/lanes.
+ ...
+    // Store the data from each thread into memory and synchronize.
+    %tmp0 = memref.alloc() : memref<128xf32>
+    %tmp1 = memref.alloc() : memref<32xf32>
+ %cnd = arith.cmpi eq, %laneid, %c0 : index
+ vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
+ some_synchronization_primitive
+ scf.if %cnd {
+ // Serialized code running only on thread 0.
+ // Load the data from all the threads into a register from thread 0. This
+      // allows thread 0 to access data from all the threads.
+ %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
+ ...
+ // Store the data from thread 0 into memory.
+ vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
+ }
+    // Synchronize and load the data in a block-cyclic way so that the
+    // vector is distributed on all threads.
+    some_synchronization_primitive
+    %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<1xf32>
+ // Execute in parallel on all threads/lanes.
+ ```
+
+ }];
+
+ let hasVerifier = 1;
+ let hasCustomAssemblyFormat = 1;
+ let arguments = (ins Index:$laneid, I64Attr:$warp_size,
+ Variadic<AnyType>:$args);
+ let results = (outs Variadic<AnyType>:$results);
+ let regions = (region SizedRegion<1>:$warpRegion);
+
+ let skipDefaultBuilders = 1;
+ let builders = [
+ OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
+ "int64_t":$warpSize)>,
+    // `blockArgTypes` are different from the `args` types, as they
+    // represent all the `args` instances visible to lane 0. Therefore we need
+    // to explicitly pass the types.
+ OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
+ "int64_t":$warpSize, "ValueRange":$args,
+ "TypeRange":$blockArgTypes)>
+ ];
+
+ let extraClassDeclaration = [{
+ bool isDefinedOutsideOfRegion(Value value) {
+ return !getRegion().isAncestor(value.getParentRegion());
+ }
+ }];
+}
+
#endif // GPU_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index c5b08d6aa022b1..d0f11acb448355 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2983,138 +2983,5 @@ def Vector_YieldOp : Vector_Op<"yield", [
let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
}
-def Vector_WarpExecuteOnLane0Op : Vector_Op<"warp_execute_on_lane_0",
- [DeclareOpInterfaceMethods<RegionBranchOpInterface, ["areTypesCompatible"]>,
- SingleBlockImplicitTerminator<"vector::YieldOp">,
- RecursiveMemoryEffects]> {
- let summary = "Executes operations in the associated region on thread #0 of a"
- "SPMD program";
- let description = [{
- `warp_execute_on_lane_0` is an operation used to bridge the gap between
- vector programming and SPMD programming model like GPU SIMT. It allows to
- trivially convert a region of vector code meant to run on a multiple threads
- into a valid SPMD region and then allows incremental transformation to
- distribute vector operations on the threads.
-
- Any code present in the region would only be executed on first thread/lane
- based on the `laneid` operand. The `laneid` operand is an integer ID between
- [0, `warp_size`). The `warp_size` attribute indicates the number of lanes in
- a warp.
-
- Operands are vector values distributed on all lanes that may be used by
- the single lane execution. The matching region argument is a vector of all
- the values of those lanes available to the single active lane. The
- distributed dimension is implicit based on the shape of the operand and
- argument. the properties of the distribution may be described by extra
- attributes (e.g. affine map).
-
- Return values are distributed on all lanes using laneId as index. The
- vector is distributed based on the shape ratio between the vector type of
- the yield and the result type.
- If the shapes are the same this means the value is broadcasted to all lanes.
- In the future the distribution can be made more explicit using affine_maps
- and will support having multiple Ids.
-
- Therefore the `warp_execute_on_lane_0` operations allow to implicitly copy
- between lane0 and the lanes of the warp. When distributing a vector
- from lane0 to all the lanes, the data are distributed in a block cyclic way.
- For exemple `vector<64xf32>` gets distributed on 32 threads and map to
- `vector<2xf32>` where thread 0 contains vector[0] and vector[1].
-
- During lowering values passed as operands and return value need to be
- visible to different lanes within the warp. This would usually be done by
- going through memory.
-
- The region is *not* isolated from above. For values coming from the parent
- region not going through operands only the lane 0 value will be accesible so
- it generally only make sense for uniform values.
-
- Example:
- ```
- // Execute in parallel on all threads/lanes.
- vector.warp_execute_on_lane_0 (%laneid)[32] {
- // Serial code running only on thread/lane 0.
- ...
- }
- // Execute in parallel on all threads/lanes.
- ```
-
- This may be lowered to an scf.if region as below:
- ```
- // Execute in parallel on all threads/lanes.
- %cnd = arith.cmpi eq, %laneid, %c0 : index
- scf.if %cnd {
- // Serial code running only on thread/lane 0.
- ...
- }
- // Execute in parallel on all threads/lanes.
- ```
-
- When the region has operands and/or return values:
- ```
- // Execute in parallel on all threads/lanes.
- %0 = vector.warp_execute_on_lane_0(%laneid)[32]
- args(%v0 : vector<4xi32>) -> (vector<1xf32>) {
- ^bb0(%arg0 : vector<128xi32>) :
- // Serial code running only on thread/lane 0.
- ...
- vector.yield %1 : vector<32xf32>
- }
- // Execute in parallel on all threads/lanes.
- ```
-
- values at the region boundary would go through memory:
- ```
- // Execute in parallel on all threads/lanes.
- ...
- // Store the data from each thread into memory and Synchronization.
- %tmp0 = memreg.alloc() : memref<128xf32>
- %tmp1 = memreg.alloc() : memref<32xf32>
- %cnd = arith.cmpi eq, %laneid, %c0 : index
- vector.store %v0, %tmp0[%laneid] : memref<128xf32>, vector<4xf32>
- some_synchronization_primitive
- scf.if %cnd {
- // Serialized code running only on thread 0.
- // Load the data from all the threads into a register from thread 0. This
- // allow threads 0 to access data from all the threads.
- %arg0 = vector.load %tmp0[%c0] : memref<128xf32>, vector<128xf32>
- ...
- // Store the data from thread 0 into memory.
- vector.store %1, %tmp1[%c0] : memref<32xf32>, vector<32xf32>
- }
- // Synchronization and load the data in a block cyclic way so that the
- // vector is distributed on all threads.
- some_synchronization_primitive
- %0 = vector.load %tmp1[%laneid] : memref<32xf32>, vector<32xf32>
- // Execute in parallel on all threads/lanes.
- ```
-
- }];
-
- let hasVerifier = 1;
- let hasCustomAssemblyFormat = 1;
- let arguments = (ins Index:$laneid, I64Attr:$warp_size,
- Variadic<AnyType>:$args);
- let results = (outs Variadic<AnyType>:$results);
- let regions = (region SizedRegion<1>:$warpRegion);
-
- let skipDefaultBuilders = 1;
- let builders = [
- OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
- "int64_t":$warpSize)>,
- // `blockArgTypes` are different than `args` types as they are they
- // represent all the `args` instances visibile to lane 0. Therefore we need
- // to explicit pass the type.
- OpBuilder<(ins "TypeRange":$resultTypes, "Value":$laneid,
- "int64_t":$warpSize, "ValueRange":$args,
- "TypeRange":$blockArgTypes)>
- ];
-
- let extraClassDeclaration = [{
- bool isDefinedOutsideOfRegion(Value value) {
- return !getRegion().isAncestor(value.getParentRegion());
- }
- }];
-}
#endif // MLIR_DIALECT_VECTOR_IR_VECTOR_OPS
diff --git a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
index 8907a2a583609a..dda45219b2acc2 100644
--- a/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ b/mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -9,6 +9,7 @@
#ifndef MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_
#define MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
namespace mlir {
@@ -23,15 +24,15 @@ struct WarpExecuteOnLane0LoweringOptions {
/// type may be VectorType or a scalar) and be availble for the current warp.
/// If there are several warps running in parallel the allocation needs to be
/// split so that each warp has its own allocation.
- using WarpAllocationFn =
- std::function<Value(Location, OpBuilder &, WarpExecuteOnLane0Op, Type)>;
+ using WarpAllocationFn = std::function<Value(
+ Location, OpBuilder &, gpu::WarpExecuteOnLane0Op, Type)>;
WarpAllocationFn warpAllocationFn = nullptr;
/// Lamdba function to let user emit operation to syncronize all the thread
/// within a warp. After this operation all the threads can see any memory
/// written before the operation.
using WarpSyncronizationFn =
- std::function<void(Location, OpBuilder &, WarpExecuteOnLane0Op)>;
+ std::function<void(Location, OpBuilder &, gpu::WarpExecuteOnLane0Op)>;
WarpSyncronizationFn warpSyncronizationFn = nullptr;
};
@@ -48,17 +49,17 @@ using DistributionMapFn = std::function<AffineMap(Value)>;
///
/// Example:
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%id){
+/// %0 = gpu.warp_execute_on_lane_0(%id){
/// ...
/// vector.transfer_write %v, %A[%c0] : vector<32xf32>, memref<128xf32>
-/// vector.yield
+/// gpu.yield
/// }
/// ```
/// To
/// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
+/// %r:3 = gpu.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
/// ...
-/// vector.yield %v : vector<32xf32>
+/// gpu.yield %v : vector<32xf32>
/// }
/// vector.transfer_write %v, %A[%id] : vector<1xf32>, memref<128xf32>
///
@@ -73,7 +74,7 @@ void populateDistributeTransferWriteOpPatterns(
/// Move scalar operations with no dependency on the warp op outside of the
/// region.
-void moveScalarUniformCode(WarpExecuteOnLane0Op op);
+void moveScalarUniformCode(gpu::WarpExecuteOnLane0Op op);
/// Lambda signature to compute a warp shuffle of a given value of a given lane
/// within a given warp size.
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
index 956877497d9338..f019007faede8d 100644
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -36,6 +36,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/StringSaver.h"
#include <cassert>
+#include <numeric>
using namespace mlir;
using namespace mlir::gpu;
@@ -2188,6 +2189,187 @@ LogicalResult gpu::DynamicSharedMemoryOp::verify() {
return success();
}
+//===----------------------------------------------------------------------===//
+// GPU WarpExecuteOnLane0Op
+//===----------------------------------------------------------------------===//
+
+void WarpExecuteOnLane0Op::print(OpAsmPrinter &p) {
+ p << "(" << getLaneid() << ")";
+
+ SmallVector<StringRef> coreAttr = {getWarpSizeAttrName()};
+ auto warpSizeAttr = getOperation()->getAttr(getWarpSizeAttrName());
+ p << "[" << llvm::cast<IntegerAttr>(warpSizeAttr).getInt() << "]";
+
+ if (!getArgs().empty())
+ p << " args(" << getArgs() << " : " << getArgs().getTypes() << ")";
+ if (!getResults().empty())
+ p << " -> (" << getResults().getTypes() << ')';
+ p << " ";
+ p.printRegion(getRegion(),
+ /*printEntryBlockArgs=*/true,
+ /*printBlockTerminators=*/!getResults().empty());
+ p.printOptionalAttrDict(getOperation()->getAttrs(), coreAttr);
+}
+
+ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser,
+ OperationState &result) {
+ // Create the region.
+ result.regions.reserve(1);
+ Region *warpRegion = result.addRegion();
+
+ auto &builder = parser.getBuilder();
+ OpAsmParser::UnresolvedOperand laneId;
+
+ // Parse predicate operand.
+ if (parser.parseLParen() ||
+ parser.parseOperand(laneId, /*allowResultNumber=*/false) ||
+ parser.parseRParen())
+ return failure();
+
+ int64_t warpSize;
+ if (parser.parseLSquare() || parser.parseInteger(warpSize) ||
+ parser.parseRSquare())
+ return failure();
+ result.addAttribute(getWarpSizeAttrName(OperationName(getOperationName(),
+ builder.getContext())),
+ builder.getI64IntegerAttr(warpSize));
+
+ if (parser.resolveOperand(laneId, builder.getIndexType(), result.operands))
+ return failure();
+
+ llvm::SMLoc inputsOperandsLoc;
+ SmallVector<OpAsmParser::UnresolvedOperand> inputsOperands;
+ SmallVector<Type> inputTypes;
+ if (succeeded(parser.parseOptionalKeyword("args"))) {
+ if (parser.parseLParen())
+ return failure();
+
+ inputsOperandsLoc = parser.getCurrentLocation();
+ if (parser.parseOperandList(inputsOperands) ||
+ parser.parseColonTypeList(inputTypes) || parser.parseRParen())
+ return failure();
+ }
+ if (parser.resolveOperands(inputsOperands, inputTypes, inputsOperandsLoc,
+ result.operands))
+ return failure();
+
+ // Parse optional results type list.
+ if (parser.parseOptionalArrowTypeList(result.types))
+ return failure();
+ // Parse the region.
+ if (parser.parseRegion(*warpRegion, /*arguments=*/{},
+ /*argTypes=*/{}))
+ return failure();
+ WarpExecuteOnLane0Op::ensureTerminator(*warpRegion, builder, result.location);
+
+ // Parse the optional attribute list.
+ if (parser.parseOptionalAttrDict(result.attributes))
+ return failure();
+ return success();
+}
+
+void WarpExecuteOnLane0Op::getSuccessorRegions(
+    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
+ if (!point.isParent()) {
+ regions.push_back(RegionSuccessor(getResults()));
+ return;
+ }
+
+ // The warp region is always executed
+ regions.push_back(RegionSuccessor(&getWarpRegion()));
+}
+
+void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
+ TypeRange resultTypes, Value laneId,
+ int64_t warpSize) {
+ build(builder, result, resultTypes, laneId, warpSize,
+ /*operands=*/std::nullopt, /*argTypes=*/std::nullopt);
+}
+
+void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
+ TypeRange resultTypes, Value laneId,
+ int64_t warpSize, ValueRange args,
+ TypeRange blockArgTypes) {
+ result.addOperands(laneId);
+ result.addAttribute(getAttributeNames()[0],
+ builder.getI64IntegerAttr(warpSize));
+ result.addTypes(resultTypes);
+ result.addOperands(args);
+ assert(args.size() == blockArgTypes.size());
+ OpBuilder::InsertionGuard guard(builder);
+ Region *warpRegion = result.addRegion();
+ Block *block = builder.createBlock(warpRegion);
+ for (auto [type, arg] : llvm::zip_equal(blockArgTypes, args))
+ block->addArgument(type, arg.getLoc());
+}
+
+/// Helper to check that the distributed vector type is consistent with the
+/// expanded type and the distributed size.
+static LogicalResult verifyDistributedType(Type expanded, Type distributed,
+ int64_t warpSize, Operation *op) {
+  // If the types match, there is no distribution.
+ if (expanded == distributed)
+ return success();
+ auto expandedVecType = llvm::dyn_cast<VectorType>(expanded);
+ auto distributedVecType = llvm::dyn_cast<VectorType>(distributed);
+ if (!expandedVecType || !distributedVecType)
+ return op->emitOpError("expected vector type for distributed operands.");
+ if (expandedVecType.getRank() != distributedVecType.getRank() ||
+ expandedVecType.getElementType() != distributedVecType.getElementType())
+ return op->emitOpError(
+ "expected distributed vectors to have same rank and element type.");
+
+ SmallVector<int64_t> scales(expandedVecType.getRank(), 1);
+ for (int64_t i = 0, e = expandedVecType.getRank(); i < e; i++) {
+ int64_t eDim = expandedVecType.getDimSize(i);
+ int64_t dDim = distributedVecType.getDimSize(i);
+ if (eDim == dDim)
+ continue;
+ if (eDim % dDim != 0)
+ return op->emitOpError()
+ << "expected expanded vector dimension #" << i << " (" << eDim
+ << ") to be a multipler of the distributed vector dimension ("
+ << dDim << ")";
+ scales[i] = eDim / dDim;
+ }
+ if (std::accumulate(scales.begin(), scales.end(), 1,
+ std::multiplies<int64_t>()) != warpSize)
+ return op->emitOpError()
+ << "incompatible distribution dimensions from " << expandedVecType
+ << " to " << distributedVecType << " with warp size = " << warpSize;
+
+ return success();
+}
+
+LogicalResult WarpExecuteOnLane0Op::verify() {
+ if (getArgs().size() != getWarpRegion().getNumArguments())
+ return emitOpError(
+ "expected same number op arguments and block arguments.");
+ auto yield =
+ cast<YieldOp>(getWarpRegion().getBlocks().begin()->getTerminator());
+ if (yield.getNumOperands() != getNumResults())
+ return emitOpError(
+ "expected same number of yield operands and return values.");
+ int64_t warpSize = getWarpSize();
+ for (auto [regionArg, arg] :
+ llvm::zip_equal(getWarpRegion().getArguments(), getArgs())) {
+ if (failed(verifyDistributedType(regionArg.getType(), arg.getType(),
+ warpSize, getOperation())))
+ return failure();
+ }
+ for (auto [yieldOperand, result] :
+ llvm::zip_equal(yield.getOperands(), getResults())) {
+ if (failed(verifyDistributedType(yieldOperand.getType(), result.getType(),
+ warpSize, getOperation())))
+ return failure();
+ }
+ return success();
+}
+bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
+ return succeeded(
+ verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
+}
+
//===----------------------------------------------------------------------===//
// GPU KernelMetadataAttr
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index db199a46e1637c..2224c24dfc4330 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -6501,188 +6501,6 @@ void SplatOp::inferResultRanges(ArrayRef<ConstantIntRanges> argRanges,
setResultRanges(getResult(), argRanges.front());
}
-//===----------------------------------------------------------------------===//
-// WarpExecuteOnLane0Op
-//===----------------------------------------------------------------------===//
-
-void WarpExecuteOnLane0Op::print(OpAsmPrinter &p) {
- p << "(" << getLaneid() << ")";
-
- SmallVector<StringRef> coreAttr = {getWarpSizeAttrName()};
- auto warpSizeAttr = getOperation()->getAttr(getWarpSizeAttrName());
- p << "[" << llvm::cast<IntegerAttr>(warpSizeAttr).getInt() << "]";
-
- if (!getArgs().empty())
- p << " args(" << getArgs() << " : " << getArgs().getTypes() << ")";
- if (!getResults().empty())
- p << " -> (" << getResults().getTypes() << ')';
- p << " ";
- p.printRegion(getRegion(),
- /*printEntryBlockArgs=*/true,
- /*printBlockTerminators=*/!getResults().empty());
- p.printOptionalAttrDict(getOperation()->getAttrs(), coreAttr);
-}
-
-ParseResult WarpExecuteOnLane0Op::parse(OpAsmParser &parser,
- OperationState &result) {
- // Create the region.
- result.regions.reserve(1);
- Region *warpRegion = result.addRegion();
-
- auto &builder = parser.getBuilder();
- OpAsmParser::UnresolvedOperand laneId;
-
- // Parse predicate operand.
- if (parser.parseLParen() ||
- parser.parseOperand(laneId, /*allowResultNumber=*/false) ||
- parser.parseRParen())
- return failure();
-
- int64_t warpSize;
- if (parser.parseLSquare() || parser.parseInteger(warpSize) ||
- parser.parseRSquare())
- return failure();
- result.addAttribute(getWarpSizeAttrName(OperationName(getOperationName(),
- builder.getContext())),
- builder.getI64IntegerAttr(warpSize));
-
- if (parser.resolveOperand(laneId, builder.getIndexType(), result.operands))
- return failure();
-
- llvm::SMLoc inputsOperandsLoc;
- SmallVector<OpAsmParser::UnresolvedOperand> inputsOperands;
- SmallVector<Type> inputTypes;
- if (succeeded(parser.parseOptionalKeyword("args"))) {
- if (parser.parseLParen())
- return failure();
-
- inputsOperandsLoc = parser.getCurrentLocation();
- if (parser.parseOperandList(inputsOperands) ||
- parser.parseColonTypeList(inputTypes) || parser.parseRParen())
- return failure();
- }
- if (parser.resolveOperands(inputsOperands, inputTypes, inputsOperandsLoc,
- result.operands))
- return failure();
-
- // Parse optional results type list.
- if (parser.parseOptionalArrowTypeList(result.types))
- return failure();
- // Parse the region.
- if (parser.parseRegion(*warpRegion, /*arguments=*/{},
- /*argTypes=*/{}))
- return failure();
- WarpExecuteOnLane0Op::ensureTerminator(*warpRegion, builder, result.location);
-
- // Parse the optional attribute list.
- if (parser.parseOptionalAttrDict(result.attributes))
- return failure();
- return success();
-}
-
-void WarpExecuteOnLane0Op::getSuccessorRegions(
-    RegionBranchPoint point, SmallVectorImpl<RegionSuccessor> &regions) {
- if (!point.isParent()) {
- regions.push_back(RegionSuccessor(getResults()));
- return;
- }
-
- // The warp region is always executed
- regions.push_back(RegionSuccessor(&getWarpRegion()));
-}
-
-void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
- TypeRange resultTypes, Value laneId,
- int64_t warpSize) {
- build(builder, result, resultTypes, laneId, warpSize,
- /*operands=*/std::nullopt, /*argTypes=*/std::nullopt);
-}
-
-void WarpExecuteOnLane0Op::build(OpBuilder &builder, OperationState &result,
- TypeRange resultTypes, Value laneId,
- int64_t warpSize, ValueRange args,
- TypeRange blockArgTypes) {
- result.addOperands(laneId);
- result.addAttribute(getAttributeNames()[0],
- builder.getI64IntegerAttr(warpSize));
- result.addTypes(resultTypes);
- result.addOperands(args);
- assert(args.size() == blockArgTypes.size());
- OpBuilder::InsertionGuard guard(builder);
- Region *warpRegion = result.addRegion();
- Block *block = builder.createBlock(warpRegion);
- for (auto [type, arg] : llvm::zip_equal(blockArgTypes, args))
- block->addArgument(type, arg.getLoc());
-}
-
-/// Helper check if the distributed vector type is consistent with the expanded
-/// type and distributed size.
-static LogicalResult verifyDistributedType(Type expanded, Type distributed,
- int64_t warpSize, Operation *op) {
- // If the types matches there is no distribution.
- if (expanded == distributed)
- return success();
- auto expandedVecType = llvm::dyn_cast<VectorType>(expanded);
- auto distributedVecType = llvm::dyn_cast<VectorType>(distributed);
- if (!expandedVecType || !distributedVecType)
- return op->emitOpError("expected vector type for distributed operands.");
- if (expandedVecType.getRank() != distributedVecType.getRank() ||
- expandedVecType.getElementType() != distributedVecType.getElementType())
- return op->emitOpError(
- "expected distributed vectors to have same rank and element type.");
-
- SmallVector<int64_t> scales(expandedVecType.getRank(), 1);
- for (int64_t i = 0, e = expandedVecType.getRank(); i < e; i++) {
- int64_t eDim = expandedVecType.getDimSize(i);
- int64_t dDim = distributedVecType.getDimSize(i);
- if (eDim == dDim)
- continue;
- if (eDim % dDim != 0)
- return op->emitOpError()
- << "expected expanded vector dimension #" << i << " (" << eDim
- << ") to be a multipler of the distributed vector dimension ("
- << dDim << ")";
- scales[i] = eDim / dDim;
- }
- if (std::accumulate(scales.begin(), scales.end(), 1,
- std::multiplies<int64_t>()) != warpSize)
- return op->emitOpError()
- << "incompatible distribution dimensions from " << expandedVecType
- << " to " << distributedVecType << " with warp size = " << warpSize;
-
- return success();
-}
-
-LogicalResult WarpExecuteOnLane0Op::verify() {
- if (getArgs().size() != getWarpRegion().getNumArguments())
- return emitOpError(
- "expected same number op arguments and block arguments.");
- auto yield =
- cast<YieldOp>(getWarpRegion().getBlocks().begin()->getTerminator());
- if (yield.getNumOperands() != getNumResults())
- return emitOpError(
- "expected same number of yield operands and return values.");
- int64_t warpSize = getWarpSize();
- for (auto [regionArg, arg] :
- llvm::zip_equal(getWarpRegion().getArguments(), getArgs())) {
- if (failed(verifyDistributedType(regionArg.getType(), arg.getType(),
- warpSize, getOperation())))
- return failure();
- }
- for (auto [yieldOperand, result] :
- llvm::zip_equal(yield.getOperands(), getResults())) {
- if (failed(verifyDistributedType(yieldOperand.getType(), result.getType(),
- warpSize, getOperation())))
- return failure();
- }
- return success();
-}
-
-bool WarpExecuteOnLane0Op::areTypesCompatible(Type lhs, Type rhs) {
- return succeeded(
- verifyDistributedType(lhs, rhs, getWarpSize(), getOperation()));
-}
-
Value mlir::vector::makeArithReduction(OpBuilder &b, Location loc,
CombiningKind kind, Value v1, Value acc,
arith::FastMathFlagsAttr fastmath,
diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
index dc5eb2527f949a..3e142598369951 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -8,6 +8,7 @@
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -22,14 +23,15 @@
using namespace mlir;
using namespace mlir::vector;
+using namespace mlir::gpu;
/// Currently the distribution map is implicit based on the vector shape. In the
/// future it will be part of the op.
/// Example:
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1x16x2xf32>) {
/// ...
-/// vector.yield %3 : vector<32x16x64xf32>
+/// gpu.yield %3 : vector<32x16x64xf32>
/// }
/// ```
/// Would have an implicit map of:
@@ -117,13 +119,13 @@ struct DistributedLoadStoreHelper {
/// 2. vectors of type V<shapexT> transit through a memref<shapexT>
///
/// When broadcastMode is true, the load is not distributed to account for
- /// the broadcast semantics of the `vector.warp_execute_on_lane_0` op.
+ /// the broadcast semantics of the `gpu.warp_execute_on_lane_0` op.
///
/// Example:
///
/// ```
- /// %r = vector.warp_execute_on_lane_0(...) -> (f32) {
- /// vector.yield %cst : f32
+ /// %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
+ /// gpu.yield %cst : f32
/// }
/// // Both types are f32. The constant %cst is broadcasted to all lanes.
/// ```
@@ -180,10 +182,10 @@ static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndReplaceReturns(
"expected WarpOp with single block");
auto yield =
- cast<vector::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
+ cast<gpu::YieldOp>(newOpBody.getBlocks().begin()->getTerminator());
rewriter.modifyOpInPlace(
- yield, [&]() { yield.getOperandsMutable().assign(newYieldedValues); });
+ yield, [&]() { yield.getValuesMutable().assign(newYieldedValues); });
return newWarpOp;
}
@@ -195,7 +197,7 @@ static WarpExecuteOnLane0Op moveRegionToNewWarpOpAndAppendReturns(
llvm::SmallVector<size_t> &indices) {
SmallVector<Type> types(warpOp.getResultTypes().begin(),
warpOp.getResultTypes().end());
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
llvm::SmallSetVector<Value, 32> yieldValues(yield.getOperands().begin(),
yield.getOperands().end());
@@ -233,7 +235,7 @@ static bool canBeHoisted(Operation *op,
/// condition and is not dead.
static OpOperand *getWarpResult(WarpExecuteOnLane0Op warpOp,
const std::function<bool(Operation *)> &fn) {
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
for (OpOperand &yieldOperand : yield->getOpOperands()) {
Value yieldValues = yieldOperand.get();
@@ -348,7 +350,7 @@ struct WarpOpToScfIfPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
// TODO: at this point, we can reuse the shared memory from previous
// buffers.
SmallVector<Value> replacements;
- auto yieldOp = cast<vector::YieldOp>(ifOp.thenBlock()->getTerminator());
+ auto yieldOp = cast<gpu::YieldOp>(ifOp.thenBlock()->getTerminator());
Location yieldLoc = yieldOp.getLoc();
for (const auto &it : llvm::enumerate(yieldOp.getOperands())) {
Value sequentialVal = it.value();
@@ -370,8 +372,8 @@ struct WarpOpToScfIfPattern : public OpRewritePattern<WarpExecuteOnLane0Op> {
rewriter.setInsertionPointAfter(ifOp);
// Result type and yielded value type are the same. This is a broadcast.
// E.g.:
- // %r = vector.warp_execute_on_lane_0(...) -> (f32) {
- // vector.yield %cst : f32
+ // %r = gpu.warp_execute_on_lane_0(...) -> (f32) {
+ // gpu.yield %cst : f32
// }
// Both types are f32. The constant %cst is broadcasted to all lanes.
// This is described in more detail in the documentation of the op.
@@ -472,17 +474,17 @@ static VectorType getDistributedType(VectorType originalType, AffineMap map,
///
/// Example:
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%id){
+/// %0 = gpu.warp_execute_on_lane_0(%id){
/// ...
/// vector.transfer_write %v, %A[%c0] : vector<32xf32>, memref<128xf32>
-/// vector.yield
+/// gpu.yield
/// }
/// ```
/// To
/// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
+/// %r:3 = gpu.warp_execute_on_lane_0(%id) -> (vector<1xf32>) {
/// ...
-/// vector.yield %v : vector<32xf32>
+/// gpu.yield %v : vector<32xf32>
/// }
/// vector.transfer_write %v, %A[%id] : vector<1xf32>, memref<128xf32>
struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
@@ -598,7 +600,7 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
// Do not process warp ops that contain only TransferWriteOps.
if (llvm::all_of(warpOp.getOps(),
- llvm::IsaPred<vector::TransferWriteOp, vector::YieldOp>))
+ llvm::IsaPred<vector::TransferWriteOp, gpu::YieldOp>))
return failure();
SmallVector<Value> yieldValues = {writeOp.getVector()};
@@ -617,13 +619,13 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
cast<vector::TransferWriteOp>(rewriter.clone(*writeOp.getOperation()));
newWriteOp.getVectorMutable().assign(newWarpOp.getResult(newRetIndices[0]));
rewriter.eraseOp(writeOp);
- rewriter.create<vector::YieldOp>(newWarpOp.getLoc());
+ rewriter.create<gpu::YieldOp>(newWarpOp.getLoc());
return success();
}
LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
Operation *lastNode = yield->getPrevNode();
auto writeOp = dyn_cast_or_null<vector::TransferWriteOp>(lastNode);
@@ -658,19 +660,19 @@ struct WarpOpTransferWrite : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// Sink out elementwise op feeding into a warp op yield.
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
/// ...
/// %3 = arith.addf %1, %2 : vector<32xf32>
-/// vector.yield %3 : vector<32xf32>
+/// gpu.yield %3 : vector<32xf32>
/// }
/// ```
/// To
/// ```
-/// %r:3 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
+/// %r:3 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
/// vector<1xf32>, vector<1xf32>) {
/// ...
/// %4 = arith.addf %2, %3 : vector<32xf32>
-/// vector.yield %4, %2, %3 : vector<32xf32>, vector<32xf32>,
+/// gpu.yield %4, %2, %3 : vector<32xf32>, vector<32xf32>,
/// vector<32xf32>
/// }
/// %0 = arith.addf %r#1, %r#2 : vector<1xf32>
@@ -728,15 +730,15 @@ struct WarpOpElementwise : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// Sink out splat constant op feeding into a warp op yield.
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
/// ...
/// %cst = arith.constant dense<2.0> : vector<32xf32>
-/// vector.yield %cst : vector<32xf32>
+/// gpu.yield %cst : vector<32xf32>
/// }
/// ```
/// To
/// ```
-/// vector.warp_execute_on_lane_0(%arg0 {
+/// gpu.warp_execute_on_lane_0(%arg0 {
/// ...
/// }
/// %0 = arith.constant dense<2.0> : vector<1xf32>
@@ -821,20 +823,20 @@ bool delinearizeLaneId(OpBuilder &builder, Location loc,
/// Sink out transfer_read op feeding into a warp op yield.
/// ```
-/// %0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
/// ...
// %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
// vector<32xf32>
-/// vector.yield %2 : vector<32xf32>
+/// gpu.yield %2 : vector<32xf32>
/// }
/// ```
/// To
/// ```
-/// %dead = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
+/// %dead = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>,
/// vector<1xf32>, vector<1xf32>) {
/// ...
/// %2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>,
-/// vector<32xf32> vector.yield %2 : vector<32xf32>
+/// vector<32xf32> gpu.yield %2 : vector<32xf32>
/// }
/// %0 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<1xf32>
struct WarpOpTransferRead : public OpRewritePattern<WarpExecuteOnLane0Op> {
@@ -959,7 +961,7 @@ struct WarpOpDeadResult : public OpRewritePattern<WarpExecuteOnLane0Op> {
newYieldValues.reserve(warpOp->getNumResults());
DenseMap<Value, int64_t> dedupYieldOperandPositionMap;
DenseMap<OpResult, int64_t> dedupResultPositionMap;
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
// Some values may be yielded multiple times and correspond to multiple
@@ -1016,7 +1018,7 @@ struct WarpOpForwardOperand : public OpRewritePattern<WarpExecuteOnLane0Op> {
PatternRewriter &rewriter) const override {
SmallVector<Type> resultTypes;
SmallVector<Value> yieldValues;
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
Value valForwarded;
unsigned resultIndex;
@@ -1135,16 +1137,16 @@ struct WarpOpShapeCast : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// Sink out vector.create_mask op feeding into a warp op yield.
/// ```
/// %0 = ...
-/// %1 = vector.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
+/// %1 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<1xf32>) {
/// ...
/// %mask = vector.create_mask %0 : vector<32xi1>
-/// vector.yield %mask : vector<32xi1>
+/// gpu.yield %mask : vector<32xi1>
/// }
/// ```
/// To
/// ```
/// %0 = ...
-/// vector.warp_execute_on_lane_0(%arg0) {
+/// gpu.warp_execute_on_lane_0(%arg0) {
/// ...
/// }
/// %cmp = arith.cmpi ult, %laneid, %0
@@ -1652,28 +1654,28 @@ struct WarpOpInsertElement : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// WarpExecuteOnLane0Op. The new scf.for region will contain a new
/// WarpExecuteOnLane0Op region. Example:
/// ```
-/// %w = vector.warp_execute_on_lane_0(%laneid) -> (vector<4xf32>) {
+/// %w = gpu.warp_execute_on_lane_0(%laneid) -> (vector<4xf32>) {
/// ...
/// %v1 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %v)
/// -> (vector<128xf32>) {
/// ...
/// scf.yield %r : vector<128xf32>
/// }
-/// vector.yield %v1 : vector<128xf32>
+/// gpu.yield %v1 : vector<128xf32>
/// }
/// ```
/// To:
-/// %w0 = vector.warp_execute_on_lane_0(%arg0) -> (vector<4xf32>) {
+/// %w0 = gpu.warp_execute_on_lane_0(%arg0) -> (vector<4xf32>) {
/// ...
-/// vector.yield %v : vector<128xf32>
+/// gpu.yield %v : vector<128xf32>
/// }
/// %w = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%varg = %q0)
/// -> (vector<4xf32>) {
-/// %iw = vector.warp_execute_on_lane_0(%laneid)
+/// %iw = gpu.warp_execute_on_lane_0(%laneid)
/// args(%varg : vector<4xf32>) -> (vector<4xf32>) {
/// ^bb0(%arg: vector<128xf32>):
/// ...
-/// vector.yield %ir : vector<128xf32>
+/// gpu.yield %ir : vector<128xf32>
/// }
/// scf.yield %iw : vector<4xf32>
/// }
@@ -1686,7 +1688,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
PatternRewriter &rewriter) const override {
- auto yield = cast<vector::YieldOp>(
+ auto yield = cast<gpu::YieldOp>(
warpOp.getBodyRegion().getBlocks().begin()->getTerminator());
// Only pick up forOp if it is the last op in the region.
Operation *lastNode = yield->getPrevNode();
@@ -1722,7 +1724,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
rewriter, warpOp, escapingValues.getArrayRef(), distTypes,
newRetIndices);
- yield = cast<vector::YieldOp>(
+ yield = cast<gpu::YieldOp>(
newWarpOp.getBodyRegion().getBlocks().begin()->getTerminator());
SmallVector<Value> newOperands;
@@ -1774,7 +1776,7 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
rewriter.eraseOp(forOp.getBody()->getTerminator());
rewriter.mergeBlocks(forOp.getBody(), innerWarp.getBody(), argMapping);
rewriter.setInsertionPointToEnd(innerWarp.getBody());
- rewriter.create<vector::YieldOp>(innerWarp.getLoc(), yieldOperands);
+ rewriter.create<gpu::YieldOp>(innerWarp.getLoc(), yieldOperands);
rewriter.setInsertionPointAfter(innerWarp);
if (!innerWarp.getResults().empty())
rewriter.create<scf::YieldOp>(forOp.getLoc(), innerWarp.getResults());
@@ -1807,17 +1809,17 @@ struct WarpOpScfForOp : public OpRewritePattern<WarpExecuteOnLane0Op> {
/// The vector is reduced in parallel. Currently limited to vector size
/// matching the warpOp size. E.g.:
/// ```
-/// %r = vector_ext.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+/// %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
/// %0 = "some_def"() : () -> (vector<32xf32>)
/// %1 = vector.reduction "add", %0 : vector<32xf32> into f32
-/// vector_ext.yield %1 : f32
+/// gpu.yield %1 : f32
/// }
/// ```
/// is lowered to:
/// ```
-/// %0 = vector_ext.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+/// %0 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
/// %1 = "some_def"() : () -> (vector<32xf32>)
-/// vector_ext.yield %1 : vector<32xf32>
+/// gpu.yield %1 : vector<32xf32>
/// }
/// %a = vector.extract %0[0] : f32 from vector<1xf32>
/// %r = ("warp.reduction %a")
diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir
index cd62b7b13fa9ae..2242786fe67595 100644
--- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir
+++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir
@@ -2,7 +2,7 @@
func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) {
%c0 = arith.constant 0 : index
- vector.warp_execute_on_lane_0(%arg0)[32] {
+ gpu.warp_execute_on_lane_0(%arg0)[32] {
// CHECK:%[[val:[0-9]+]] = llvm.extractelement
// CHECK:%[[base:[0-9]+]] = llvm.extractvalue
// CHECK:%[[ptr:[0-9]+]] = llvm.getelementptr %[[base]]
diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir
index 2a0f7e8c6b10c2..16148a493ce6ea 100644
--- a/mlir/test/Dialect/GPU/invalid.mlir
+++ b/mlir/test/Dialect/GPU/invalid.mlir
@@ -877,3 +877,89 @@ gpu.binary @binary [#gpu.object<#rocdl.target<chip = "gfx900">,
]>,
bin = "BLOB">
]
+
+// -----
+
+func.func @warp_wrong_num_outputs(%laneid: index) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected same number of yield operands and return values.}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) {
+ }
+ return
+}
+
+// -----
+
+func.func @warp_wrong_num_inputs(%laneid: index) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected same number of op arguments and block arguments.}}
+ gpu.warp_execute_on_lane_0(%laneid)[64] {
+ ^bb0(%arg0 : vector<128xi32>) :
+ }
+ return
+}
+
+// -----
+
+func.func @warp_wrong_return_distribution(%laneid: index) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) {
+ %0 = arith.constant dense<2>: vector<128xi32>
+ gpu.yield %0 : vector<128xi32>
+ }
+ return
+}
+
+
+// -----
+
+func.func @warp_wrong_arg_distribution(%laneid: index, %v0 : vector<4xi32>) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}}
+ gpu.warp_execute_on_lane_0(%laneid)[64]
+ args(%v0 : vector<4xi32>) {
+ ^bb0(%arg0 : vector<128xi32>) :
+ }
+ return
+}
+
+// -----
+
+func.func @warp_2_distributed_dims(%laneid: index) {
+  // expected-error@+1 {{incompatible distribution dimensions from 'vector<128x128xi32>' to 'vector<4x4xi32>' with warp size = 32}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) {
+ %0 = arith.constant dense<2>: vector<128x128xi32>
+ gpu.yield %0 : vector<128x128xi32>
+ }
+ return
+}
+
+// -----
+
+func.func @warp_2_distributed_dims(%laneid: index) {
+  // expected-error@+1 {{expected expanded vector dimension #1 (8) to be a multiple of the distributed vector dimension (3)}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x3xi32>) {
+ %0 = arith.constant dense<2>: vector<4x8xi32>
+ gpu.yield %0 : vector<4x8xi32>
+ }
+ return
+}
+
+// -----
+
+func.func @warp_mismatch_rank(%laneid: index) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected distributed vectors to have same rank and element type.}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) {
+ %0 = arith.constant dense<2>: vector<128xi32>
+ gpu.yield %0 : vector<128xi32>
+ }
+ return
+}
+
+// -----
+
+func.func @warp_mismatch_rank(%laneid: index) {
+  // expected-error@+1 {{'gpu.warp_execute_on_lane_0' op expected vector type for distributed operands.}}
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (i32) {
+ %0 = arith.constant dense<2>: vector<128xi32>
+ gpu.yield %0 : vector<128xi32>
+ }
+ return
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
index b9c0a0e79e8f2a..c0ff2044b76c40 100644
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -464,3 +464,39 @@ gpu.binary @kernel_attrs_2 [
]>,
bin = "BLOB">
]
+
+// CHECK-LABEL: func @warp_execute_on_lane_0(
+func.func @warp_execute_on_lane_0(%laneid: index) {
+// CHECK-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
+// CHECK-NEXT: }
+ }
+// CHECK-NEXT: return
+ return
+}
+
+// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d
+func.func @warp_execute_on_lane_0_2d(%laneid: index) {
+ // CHECK: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>)
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) {
+ %0 = arith.constant dense<2>: vector<4x32xi32>
+ // CHECK: gpu.yield %{{.+}} : vector<4x32xi32>
+ gpu.yield %0 : vector<4x32xi32>
+ }
+ return
+}
+
+// CHECK-LABEL: func @warp_operand_result(
+func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) {
+// CHECK-NEXT: %{{.*}} = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) {
+ %2 = gpu.warp_execute_on_lane_0(%laneid)[32]
+ args(%v0 : vector<4xi32>) -> (vector<4xi32>) {
+ ^bb0(%arg0 : vector<128xi32>) :
+ %0 = arith.constant dense<2>: vector<128xi32>
+ %1 = arith.addi %arg0, %0 : vector<128xi32>
+// CHECK: gpu.yield %{{.*}} : vector<128xi32>
+ gpu.yield %1 : vector<128xi32>
+// CHECK-NEXT: }
+ }
+ return %2 : vector<4xi32>
+}
diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index d591c60acb64e7..b3077a38e92c09 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -1611,92 +1611,6 @@ func.func @invalid_splat(%v : f32) {
// -----
-func.func @warp_wrong_num_outputs(%laneid: index) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected same number of yield operands and return values.}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) {
- }
- return
-}
-
-// -----
-
-func.func @warp_wrong_num_inputs(%laneid: index) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected same number op arguments and block arguments.}}
- vector.warp_execute_on_lane_0(%laneid)[64] {
- ^bb0(%arg0 : vector<128xi32>) :
- }
- return
-}
-
-// -----
-
-func.func @warp_wrong_return_distribution(%laneid: index) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<4xi32>) {
- %0 = arith.constant dense<2>: vector<128xi32>
- vector.yield %0 : vector<128xi32>
- }
- return
-}
-
-
-// -----
-
-func.func @warp_wrong_arg_distribution(%laneid: index, %v0 : vector<4xi32>) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op incompatible distribution dimensions from 'vector<128xi32>' to 'vector<4xi32>'}}
- vector.warp_execute_on_lane_0(%laneid)[64]
- args(%v0 : vector<4xi32>) {
- ^bb0(%arg0 : vector<128xi32>) :
- }
- return
-}
-
-// -----
-
-func.func @warp_2_distributed_dims(%laneid: index) {
-  // expected-error@+1 {{incompatible distribution dimensions from 'vector<128x128xi32>' to 'vector<4x4xi32>' with warp size = 32}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) {
- %0 = arith.constant dense<2>: vector<128x128xi32>
- vector.yield %0 : vector<128x128xi32>
- }
- return
-}
-
-// -----
-
-func.func @warp_2_distributed_dims(%laneid: index) {
-  // expected-error@+1 {{expected expanded vector dimension #1 (8) to be a multipler of the distributed vector dimension (3)}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x3xi32>) {
- %0 = arith.constant dense<2>: vector<4x8xi32>
- vector.yield %0 : vector<4x8xi32>
- }
- return
-}
-
-// -----
-
-func.func @warp_mismatch_rank(%laneid: index) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected distributed vectors to have same rank and element type.}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x4xi32>) {
- %0 = arith.constant dense<2>: vector<128xi32>
- vector.yield %0 : vector<128xi32>
- }
- return
-}
-
-// -----
-
-func.func @warp_mismatch_rank(%laneid: index) {
-  // expected-error@+1 {{'vector.warp_execute_on_lane_0' op expected vector type for distributed operands.}}
- %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (i32) {
- %0 = arith.constant dense<2>: vector<128xi32>
- vector.yield %0 : vector<128xi32>
- }
- return
-}
-
-// -----
-
func.func @vector_mask_multiple_ops(%t0: tensor<?xf32>, %t1: tensor<?xf32>, %idx: index, %val: vector<16xf32>, %m0: vector<16xi1>) {
%ft0 = arith.constant 0.0 : f32
// expected-error at +1 {{'vector.mask' op expects only one operation to mask}}
diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir
index 3baacba9b61243..7a0f67590f3ec8 100644
--- a/mlir/test/Dialect/Vector/ops.mlir
+++ b/mlir/test/Dialect/Vector/ops.mlir
@@ -942,41 +942,6 @@ func.func @vector_splat_0d(%a: f32) -> vector<f32> {
return %0 : vector<f32>
}
-// CHECK-LABEL: func @warp_execute_on_lane_0(
-func.func @warp_execute_on_lane_0(%laneid: index) {
-// CHECK-NEXT: vector.warp_execute_on_lane_0(%{{.*}})[32] {
- vector.warp_execute_on_lane_0(%laneid)[32] {
-// CHECK-NEXT: }
- }
-// CHECK-NEXT: return
- return
-}
-
-// CHECK-LABEL: func.func @warp_execute_on_lane_0_2d
-func.func @warp_execute_on_lane_0_2d(%laneid: index) {
- // CHECK: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x4xi32>)
- %2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x4xi32>) {
- %0 = arith.constant dense<2>: vector<4x32xi32>
- // CHECK: vector.yield %{{.+}} : vector<4x32xi32>
- vector.yield %0 : vector<4x32xi32>
- }
- return
-}
-
-// CHECK-LABEL: func @warp_operand_result(
-func.func @warp_operand_result(%laneid: index, %v0 : vector<4xi32>) -> (vector<4xi32>) {
-// CHECK-NEXT: %{{.*}} = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xi32>) -> (vector<4xi32>) {
- %2 = vector.warp_execute_on_lane_0(%laneid)[32]
- args(%v0 : vector<4xi32>) -> (vector<4xi32>) {
- ^bb0(%arg0 : vector<128xi32>) :
- %0 = arith.constant dense<2>: vector<128xi32>
- %1 = arith.addi %arg0, %0 : vector<128xi32>
-// CHECK: vector.yield %{{.*}} : vector<128xi32>
- vector.yield %1 : vector<128xi32>
-// CHECK-NEXT: }
- }
- return %2 : vector<4xi32>
-}
// CHECK-LABEL: func @vector_mask
func.func @vector_mask(%a: vector<8xi32>, %m0: vector<8xi1>) -> i32 {
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index b4491812dc26cb..dbe0b39422369c 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -44,7 +44,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
// CHECK-SCF-IF-DAG: %[[buffer_def_1:.*]] = memref.get_global @__shared_64xf32
// CHECK-SCF-IF: scf.if %[[is_lane_0]] {
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
args(%v0, %v1 : vector<4xf32>, vector<8xf32>) -> (vector<1xf32>, vector<2xf32>) {
^bb0(%arg0: vector<128xf32>, %arg1: vector<256xf32>):
// CHECK-SCF-IF: %[[arg1:.*]] = vector.transfer_read %[[buffer_v1]][%[[c0]]], %{{.*}} {in_bounds = [true]} : memref<256xf32, 3>, vector<256xf32>
@@ -55,7 +55,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
%3 = "some_def"(%arg1) : (vector<256xf32>) -> vector<64xf32>
// CHECK-SCF-IF: vector.transfer_write %[[def_0]], %[[buffer_def_0]][%[[c0]]]
// CHECK-SCF-IF: vector.transfer_write %[[def_1]], %[[buffer_def_1]][%[[c0]]]
- vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
+ gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
}
// CHECK-SCF-IF: }
// CHECK-SCF-IF: gpu.barrier
@@ -77,17 +77,17 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
// CHECK-HOIST: memref.subview
-// CHECK-HOIST: vector.warp_execute_on_lane_0
+// CHECK-HOIST: gpu.warp_execute_on_lane_0
-// CHECK-D: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) {
+// CHECK-D: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>, vector<1xf32>) {
// CHECK-D: arith.addf {{.*}} : vector<32xf32>
// CHECK-D: arith.addf {{.*}} : vector<64xf32>
-// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32>
+// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, vector<32xf32>
// CHECK-D-DAG: vector.transfer_write %[[R]]#1, %{{.*}}[%{{.*}}] {in_bounds = [true]} : vector<1xf32>, memref<128xf32
// CHECK-D-DAG: %[[ID1:.*]] = affine.apply #[[MAP1]]()[%{{.*}}]
// CHECK-D-DAG: vector.transfer_write %[[R]]#0, %{{.*}}[%[[ID1]]] {in_bounds = [true]} : vector<2xf32>, memref<128xf32
-// CHECK-DIST-AND-PROP-NOT: vector.warp_execute_on_lane_0
+// CHECK-DIST-AND-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<1xf32>
// CHECK-DIST-AND-PROP: vector.transfer_read {{.*}} vector<2xf32>
@@ -99,7 +99,7 @@ func.func @rewrite_warp_op_to_scf_if(%laneid: index,
func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>,
%arg3: memref<1024xf32>, %gid : index) {
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%sa = memref.subview %arg1[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
%sb = memref.subview %arg2[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
%sc = memref.subview %arg3[%gid] [128] [1] : memref<1024xf32> to memref<128xf32, strided<[1], offset: ?>>
@@ -121,20 +121,20 @@ func.func @warp(%laneid: index, %arg1: memref<1024xf32>, %arg2: memref<1024xf32>
// -----
// CHECK-D-LABEL: func @warp_extract(
-// CHECK-D: %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>)
+// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1x1xf32>)
// CHECK-D: "test.dummy_op"
// CHECK-D: "test.dummy_op"
-// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32>
+// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<1xf32>, vector<1x1xf32>
// CHECK-D: }
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<1x1xf32>
// CHECK-D: }
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<1xf32>
// CHECK-D: }
func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%c0 = arith.constant 0 : index
%v = "test.dummy_op"() : () -> (vector<1xf32>)
%v1 = "test.dummy_op"() : () -> (vector<1x1xf32>)
@@ -149,20 +149,20 @@ func.func @warp_extract(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : ind
// Check that we can distribute writes of the maximum allowed number of elements.
// CHECK-D-LABEL: func @warp_extract_4_elems(
-// CHECK-D: %[[WARPOP:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>)
+// CHECK-D: %[[WARPOP:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4x1xf32>)
// CHECK-D: "test.dummy_op"
// CHECK-D: "test.dummy_op"
-// CHECK-D: vector.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32>
+// CHECK-D: gpu.yield %{{.*}}, %{{.*}} : vector<4xf32>, vector<4x1xf32>
// CHECK-D: }
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: vector.transfer_write %[[WARPOP]]#1, %{{.*}}[%{{.*}}] {{.*}} : vector<4x1xf32>
// CHECK-D: }
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: vector.transfer_write %[[WARPOP]]#0, %{{.*}}[%{{.*}}] {{.*}} : vector<4xf32>
// CHECK-D: }
func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%c0 = arith.constant 0 : index
%v = "test.dummy_op"() : () -> (vector<4xf32>)
%v1 = "test.dummy_op"() : () -> (vector<4x1xf32>)
@@ -179,7 +179,7 @@ func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g
// CHECK-D-LABEL: func @warp_extract_5_elems(
// CHECK-D: arith.constant 0 : index
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: %[[V:.+]] = "test.dummy_op"
// CHECK-D: %[[V1:.+]] = "test.dummy_op"
// CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<5x1xf32>
@@ -187,7 +187,7 @@ func.func @warp_extract_4_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g
// CHECK-D: }
func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%c0 = arith.constant 0 : index
%v = "test.dummy_op"() : () -> (vector<5xf32>)
%v1 = "test.dummy_op"() : () -> (vector<5x1xf32>)
@@ -204,7 +204,7 @@ func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g
// CHECK-D-LABEL: func @warp_extract_8_elems(
// CHECK-D: arith.constant 0 : index
-// CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: %[[V:.+]] = "test.dummy_op"
// CHECK-D: %[[V1:.+]] = "test.dummy_op"
// CHECK-D: vector.transfer_write %[[V1]], %{{.*}}[%{{.*}}] {{.*}} : vector<8x1xf32>
@@ -212,7 +212,7 @@ func.func @warp_extract_5_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g
// CHECK-D: }
func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %gid : index) {
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%c0 = arith.constant 0 : index
%v = "test.dummy_op"() : () -> (vector<8xf32>)
%v1 = "test.dummy_op"() : () -> (vector<8x1xf32>)
@@ -226,14 +226,14 @@ func.func @warp_extract_8_elems(%laneid: index, %arg1: memref<1024x1024xf32>, %g
// CHECK-PROP-LABEL: func @warp_dead_result(
func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) {
- // CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>)
- %r:3 = vector.warp_execute_on_lane_0(%laneid)[32] ->
+ // CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>)
+ %r:3 = gpu.warp_execute_on_lane_0(%laneid)[32] ->
(vector<1xf32>, vector<1xf32>, vector<1xf32>) {
%2 = "some_def"() : () -> (vector<32xf32>)
%3 = "some_def"() : () -> (vector<32xf32>)
%4 = "some_def"() : () -> (vector<32xf32>)
- // CHECK-PROP: vector.yield %{{.*}} : vector<32xf32>
- vector.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32>
+ // CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32>
+ gpu.yield %2, %3, %4 : vector<32xf32>, vector<32xf32>, vector<32xf32>
}
// CHECK-PROP: return %[[R]] : vector<1xf32>
return %r#1 : vector<1xf32>
@@ -245,10 +245,10 @@ func.func @warp_dead_result(%laneid: index) -> (vector<1xf32>) {
// CHECK-PROP-SAME: %[[ID:.*]]: index, %[[V:.*]]: vector<4xf32>)
func.func @warp_propagate_operand(%laneid: index, %v0: vector<4xf32>)
-> (vector<4xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32]
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32]
args(%v0 : vector<4xf32>) -> (vector<4xf32>) {
^bb0(%arg0 : vector<128xf32>) :
- vector.yield %arg0 : vector<128xf32>
+ gpu.yield %arg0 : vector<128xf32>
}
// CHECK-PROP: return %[[V]] : vector<4xf32>
return %r : vector<4xf32>
@@ -263,21 +263,21 @@ func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) {
%c0 = arith.constant 0 : index
%c32 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- // CHECK-PROP: %[[R:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>)
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->
+ // CHECK-PROP: %[[R:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>, vector<2xf32>, vector<2xf32>)
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->
(vector<1xf32>, vector<2xf32>) {
// CHECK-PROP: %[[V0:.*]] = "some_def"() : () -> vector<32xf32>
// CHECK-PROP: %[[V1:.*]] = "some_def"() : () -> vector<32xf32>
// CHECK-PROP: %[[V2:.*]] = "some_def"() : () -> vector<64xf32>
// CHECK-PROP: %[[V3:.*]] = "some_def"() : () -> vector<64xf32>
- // CHECK-PROP: vector.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32>
+ // CHECK-PROP: gpu.yield %[[V0]], %[[V1]], %[[V2]], %[[V3]] : vector<32xf32>, vector<32xf32>, vector<64xf32>, vector<64xf32>
%2 = "some_def"() : () -> (vector<32xf32>)
%3 = "some_def"() : () -> (vector<32xf32>)
%4 = "some_def"() : () -> (vector<64xf32>)
%5 = "some_def"() : () -> (vector<64xf32>)
%6 = arith.addf %2, %3 : vector<32xf32>
%7 = arith.addf %4, %5 : vector<64xf32>
- vector.yield %6, %7 : vector<32xf32>, vector<64xf32>
+ gpu.yield %6, %7 : vector<32xf32>, vector<64xf32>
}
// CHECK-PROP: %[[A0:.*]] = arith.addf %[[R]]#2, %[[R]]#3 : vector<2xf32>
// CHECK-PROP: %[[A1:.*]] = arith.addf %[[R]]#0, %[[R]]#1 : vector<1xf32>
@@ -292,18 +292,18 @@ func.func @warp_propagate_elementwise(%laneid: index, %dest: memref<1024xf32>) {
// -----
// CHECK-PROP-LABEL: func @warp_propagate_scalar_arith(
-// CHECK-PROP: %[[r:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} {
+// CHECK-PROP: %[[r:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} {
// CHECK-PROP: %[[some_def0:.*]] = "some_def"
// CHECK-PROP: %[[some_def1:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[some_def0]], %[[some_def1]]
+// CHECK-PROP: gpu.yield %[[some_def0]], %[[some_def1]]
// CHECK-PROP: }
// CHECK-PROP: arith.addf %[[r]]#0, %[[r]]#1 : f32
func.func @warp_propagate_scalar_arith(%laneid: index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (f32)
%1 = "some_def"() : () -> (f32)
%2 = arith.addf %0, %1 : f32
- vector.yield %2 : f32
+ gpu.yield %2 : f32
}
vector.print %r : f32
return
@@ -312,13 +312,13 @@ func.func @warp_propagate_scalar_arith(%laneid: index) {
// -----
// CHECK-PROP-LABEL: func @warp_propagate_cast(
-// CHECK-PROP-NOT: vector.warp_execute_on_lane_0
+// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP: %[[result:.*]] = arith.sitofp %{{.*}} : i32 to f32
// CHECK-PROP: return %[[result]]
func.func @warp_propagate_cast(%laneid : index, %i : i32) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%casted = arith.sitofp %i : i32 to f32
- vector.yield %casted : f32
+ gpu.yield %casted : f32
}
return %r : f32
}
@@ -341,10 +341,10 @@ func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: me
%c0 = arith.constant 0 : index
%c32 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] ->(vector<1xf32>, vector<2xf32>) {
%2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
%3 = vector.transfer_read %src[%c32], %cst : memref<1024xf32>, vector<64xf32>
- vector.yield %2, %3 : vector<32xf32>, vector<64xf32>
+ gpu.yield %2, %3 : vector<32xf32>, vector<64xf32>
}
%id2 = affine.apply #map0()[%laneid]
vector.transfer_write %r#0, %dest[%laneid] : vector<1xf32>, memref<1024xf32>
@@ -355,15 +355,15 @@ func.func @warp_propagate_read(%laneid: index, %src: memref<1024xf32>, %dest: me
// -----
// CHECK-PROP-LABEL: func @fold_vector_broadcast(
-// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
+// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
// CHECK-PROP: %[[some_def:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32>
+// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32>
// CHECK-PROP: vector.print %[[r]] : vector<1xf32>
func.func @fold_vector_broadcast(%laneid: index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
%0 = "some_def"() : () -> (vector<1xf32>)
%1 = vector.broadcast %0 : vector<1xf32> to vector<32xf32>
- vector.yield %1 : vector<32xf32>
+ gpu.yield %1 : vector<32xf32>
}
vector.print %r : vector<1xf32>
return
@@ -372,16 +372,16 @@ func.func @fold_vector_broadcast(%laneid: index) {
// -----
// CHECK-PROP-LABEL: func @extract_vector_broadcast(
-// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
+// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
// CHECK-PROP: %[[some_def:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[some_def]] : vector<1xf32>
+// CHECK-PROP: gpu.yield %[[some_def]] : vector<1xf32>
// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : vector<1xf32> to vector<2xf32>
// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_vector_broadcast(%laneid: index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
%0 = "some_def"() : () -> (vector<1xf32>)
%1 = vector.broadcast %0 : vector<1xf32> to vector<64xf32>
- vector.yield %1 : vector<64xf32>
+ gpu.yield %1 : vector<64xf32>
}
vector.print %r : vector<2xf32>
return
@@ -390,16 +390,16 @@ func.func @extract_vector_broadcast(%laneid: index) {
// -----
// CHECK-PROP-LABEL: func @extract_scalar_vector_broadcast(
-// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (f32)
+// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (f32)
// CHECK-PROP: %[[some_def:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[some_def]] : f32
+// CHECK-PROP: gpu.yield %[[some_def]] : f32
// CHECK-PROP: %[[broadcasted:.*]] = vector.broadcast %[[r]] : f32 to vector<2xf32>
// CHECK-PROP: vector.print %[[broadcasted]] : vector<2xf32>
func.func @extract_scalar_vector_broadcast(%laneid: index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>) {
%0 = "some_def"() : () -> (f32)
%1 = vector.broadcast %0 : f32 to vector<64xf32>
- vector.yield %1 : vector<64xf32>
+ gpu.yield %1 : vector<64xf32>
}
vector.print %r : vector<2xf32>
return
@@ -408,16 +408,16 @@ func.func @extract_scalar_vector_broadcast(%laneid: index) {
// -----
// CHECK-PROP-LABEL: func @warp_scf_for(
-// CHECK-PROP: %[[INI:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) {
+// CHECK-PROP: %[[INI:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>) {
// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[INI1]] : vector<128xf32>
+// CHECK-PROP: gpu.yield %[[INI1]] : vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]] = scf.for %[[IT:.+]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]) -> (vector<4xf32>) {
// CHECK-PROP: %[[A:.*]] = arith.addi %[[IT]], %{{.*}} : index
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]] : vector<4xf32>) -> (vector<4xf32>) {
// CHECK-PROP: ^bb0(%[[ARG:.*]]: vector<128xf32>):
// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[A]], %[[ARG]]) : (index, vector<128xf32>) -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32>
+// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: scf.yield %[[W]] : vector<4xf32>
// CHECK-PROP: }
@@ -426,14 +426,14 @@ func.func @warp_scf_for(%arg0: index) {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
- %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
%ini = "some_def"() : () -> (vector<128xf32>)
%3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
%add = arith.addi %arg3, %c1 : index
%acc = "some_def"(%add, %arg4) : (index, vector<128xf32>) -> (vector<128xf32>)
scf.yield %acc : vector<128xf32>
}
- vector.yield %3 : vector<128xf32>
+ gpu.yield %3 : vector<128xf32>
}
"some_use"(%0) : (vector<4xf32>) -> ()
return
@@ -442,16 +442,16 @@ func.func @warp_scf_for(%arg0: index) {
// -----
// CHECK-PROP-LABEL: func @warp_scf_for_use_from_above(
-// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP: %[[USE:.*]] = "some_def_above"() : () -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32>
+// CHECK-PROP: gpu.yield %[[INI1]], %[[USE]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG:.*]] = %[[INI]]#0) -> (vector<4xf32>) {
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG]], %[[INI]]#1 : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>) {
// CHECK-PROP: ^bb0(%[[ARG0:.*]]: vector<128xf32>, %[[ARG1:.*]]: vector<128xf32>):
// CHECK-PROP: %[[ACC:.*]] = "some_def"(%[[ARG0]], %[[ARG1]]) : (vector<128xf32>, vector<128xf32>) -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[ACC]] : vector<128xf32>
+// CHECK-PROP: gpu.yield %[[ACC]] : vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: scf.yield %[[W]] : vector<4xf32>
// CHECK-PROP: }
@@ -460,14 +460,14 @@ func.func @warp_scf_for_use_from_above(%arg0: index) {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
- %0 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
+ %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) {
%ini = "some_def"() : () -> (vector<128xf32>)
%use_from_above = "some_def_above"() : () -> (vector<128xf32>)
%3 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini) -> (vector<128xf32>) {
%acc = "some_def"(%arg4, %use_from_above) : (vector<128xf32>, vector<128xf32>) -> (vector<128xf32>)
scf.yield %acc : vector<128xf32>
}
- vector.yield %3 : vector<128xf32>
+ gpu.yield %3 : vector<128xf32>
}
"some_use"(%0) : (vector<4xf32>) -> ()
return
@@ -476,17 +476,17 @@ func.func @warp_scf_for_use_from_above(%arg0: index) {
// -----
// CHECK-PROP-LABEL: func @warp_scf_for_swap(
-// CHECK-PROP: %[[INI:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[INI:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32>
// CHECK-PROP: %[[INI2:.*]] = "some_def"() : () -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32>
+// CHECK-PROP: gpu.yield %[[INI1]], %[[INI2]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[FARG1:.*]] = %[[INI]]#0, %[[FARG2:.*]] = %[[INI]]#1) -> (vector<4xf32>, vector<4xf32>) {
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%[[FARG1]], %[[FARG2]] : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) {
// CHECK-PROP: ^bb0(%[[ARG1:.*]]: vector<128xf32>, %[[ARG2:.*]]: vector<128xf32>):
// CHECK-PROP: %[[ACC1:.*]] = "some_def"(%[[ARG1]]) : (vector<128xf32>) -> vector<128xf32>
// CHECK-PROP: %[[ACC2:.*]] = "some_def"(%[[ARG2]]) : (vector<128xf32>) -> vector<128xf32>
-// CHECK-PROP: vector.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32>
+// CHECK-PROP: gpu.yield %[[ACC2]], %[[ACC1]] : vector<128xf32>, vector<128xf32>
// CHECK-PROP: }
// CHECK-PROP: scf.yield %[[W]]#0, %[[W]]#1 : vector<4xf32>, vector<4xf32>
// CHECK-PROP: }
@@ -496,7 +496,7 @@ func.func @warp_scf_for_swap(%arg0: index) {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
- %0:2 = vector.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) {
+ %0:2 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>, vector<4xf32>) {
%ini1 = "some_def"() : () -> (vector<128xf32>)
%ini2 = "some_def"() : () -> (vector<128xf32>)
%3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini1, %arg5 = %ini2) -> (vector<128xf32>, vector<128xf32>) {
@@ -504,7 +504,7 @@ func.func @warp_scf_for_swap(%arg0: index) {
%acc2 = "some_def"(%arg5) : (vector<128xf32>) -> (vector<128xf32>)
scf.yield %acc2, %acc1 : vector<128xf32>, vector<128xf32>
}
- vector.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32>
+ gpu.yield %3#0, %3#1 : vector<128xf32>, vector<128xf32>
}
"some_use"(%0#0) : (vector<4xf32>) -> ()
"some_use"(%0#1) : (vector<4xf32>) -> ()
@@ -515,7 +515,7 @@ func.func @warp_scf_for_swap(%arg0: index) {
// CHECK-PROP-LABEL: func @warp_scf_for_swap_no_yield(
// CHECK-PROP: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
-// CHECK-PROP-NEXT: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+// CHECK-PROP-NEXT: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-PROP-NEXT: "some_op"() : () -> ()
// CHECK-PROP-NEXT: }
// CHECK-PROP-NEXT: }
@@ -523,7 +523,7 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) {
%c128 = arith.constant 128 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
- vector.warp_execute_on_lane_0(%arg0)[32] {
+ gpu.warp_execute_on_lane_0(%arg0)[32] {
scf.for %arg3 = %c0 to %c128 step %c1 {
"some_op"() : () -> ()
}
@@ -538,15 +538,15 @@ func.func @warp_scf_for_swap_no_yield(%arg0: index) {
#map2 = affine_map<()[s0] -> (s0 * 4 + 128)>
// CHECK-PROP-LABEL: func @warp_scf_for_multiple_yield(
-// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
+// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
// CHECK-PROP-NEXT: "some_def"() : () -> vector<32xf32>
-// CHECK-PROP-NEXT: vector.yield %{{.*}} : vector<32xf32>
+// CHECK-PROP-NEXT: gpu.yield %{{.*}} : vector<32xf32>
// CHECK-PROP-NEXT: }
-// CHECK-PROP-NOT: vector.warp_execute_on_lane_0
+// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
// CHECK-PROP: %{{.*}}:2 = scf.for {{.*}} -> (vector<4xf32>, vector<4xf32>) {
-// CHECK-PROP-NOT: vector.warp_execute_on_lane_0
+// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
// CHECK-PROP: vector.transfer_read {{.*}} : memref<?xf32>, vector<4xf32>
// CHECK-PROP: arith.addf {{.*}} : vector<4xf32>
@@ -559,7 +559,7 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %0:3 = vector.warp_execute_on_lane_0(%arg0)[32] ->
+ %0:3 = gpu.warp_execute_on_lane_0(%arg0)[32] ->
(vector<1xf32>, vector<4xf32>, vector<4xf32>) {
%def = "some_def"() : () -> (vector<32xf32>)
%r1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<?xf32>, vector<128xf32>
@@ -574,7 +574,7 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
%7 = arith.addf %5, %arg5 : vector<128xf32>
scf.yield %6, %7 : vector<128xf32>, vector<128xf32>
}
- vector.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32>
+ gpu.yield %def, %3#0, %3#1 : vector<32xf32>, vector<128xf32>, vector<128xf32>
}
%1 = affine.apply #map()[%arg0]
vector.transfer_write %0#1, %arg2[%1] {in_bounds = [true]} : vector<4xf32>, memref<?xf32>
@@ -594,8 +594,8 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32
// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32
// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32
-// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
-// CHECK-PROP: vector.yield %{{.*}} : vector<32xf32>
+// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
+// CHECK-PROP: gpu.yield %{{.*}} : vector<32xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[a:.*]] = vector.extract %[[warp_op]][0] : f32 from vector<1xf32>
// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]]
@@ -610,10 +610,10 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref<?xf32>, %arg2
// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
// CHECK-PROP: return %[[a4]] : f32
func.func @vector_reduction(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<32xf32>)
%1 = vector.reduction <add>, %0 : vector<32xf32> into f32
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -624,11 +624,11 @@ func.func @vector_reduction(%laneid: index) -> (f32) {
// CHECK-PROP-SAME: %[[ID:[a-zA-Z0-9]+]]
// CHECK-PROP-SAME: %[[SRC:[a-zA-Z0-9]+]]
// CHECK-PROP-SAME: %[[DEST:[a-zA-Z0-9]+]]
-// CHECK-PROP: vector.warp_execute_on_lane_0(%[[ID]])[32]
+// CHECK-PROP: gpu.warp_execute_on_lane_0(%[[ID]])[32]
// CHECK-PROP-NEXT: "some_def"() : () -> vector<4096xf32>
// CHECK-PROP-NEXT: %{{.*}} = vector.reduction
// CHECK-PROP: %[[DEF:.*]] = arith.divf %{{.*}}, %{{.*}} : vector<1xf32>
-// CHECK-PROP-NOT: vector.warp_execute_on_lane_0
+// CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP: scf.for
// CHECK-PROP: %{{.*}} = arith.subf %{{.*}}, %[[DEF]] : vector<1xf32>
func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<128xf32>){
@@ -637,7 +637,7 @@ func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<12
%c1 = arith.constant 1 : index
%c128 = arith.constant 128 : index
%f0 = arith.constant 0.000000e+00 : f32
- vector.warp_execute_on_lane_0(%arg0)[32]{
+ gpu.warp_execute_on_lane_0(%arg0)[32]{
%cst_1 = arith.constant dense<2.621440e+05> : vector<1xf32>
%0 = "some_def"() : () -> (vector<4096xf32>)
%1 = vector.reduction <add>, %0, %cst : vector<4096xf32> into f32
@@ -657,10 +657,10 @@ func.func @warp_distribute(%arg0: index, %src: memref<128xf32>, %dest: memref<12
func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref<f32>) {
%c0 = arith.constant 0: index
%f0 = arith.constant 0.0: f32
- // CHECK-D: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
- // CHECK-D: vector.warp_execute_on_lane_0(%{{.*}})[32] {
+ // CHECK-D: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
+ // CHECK-D: gpu.warp_execute_on_lane_0(%{{.*}})[32] {
// CHECK-D: vector.transfer_write %[[R]], %{{.*}}[] : vector<f32>, memref<f32>
- vector.warp_execute_on_lane_0(%laneid)[32] {
+ gpu.warp_execute_on_lane_0(%laneid)[32] {
%0 = vector.transfer_read %m0[%c0, %c0, %c0], %f0 {in_bounds = [true]} : memref<4x2x32xf32>, vector<32xf32>
%1 = vector.transfer_read %m1[], %f0 : memref<f32>, vector<f32>
%2 = vector.extractelement %1[] : vector<f32>
@@ -682,8 +682,8 @@ func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref
// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32
// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32
// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32
-// CHECK-PROP: %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
-// CHECK-PROP: vector.yield %{{.*}} : vector<64xf32>
+// CHECK-PROP: %[[warp_op:.*]] = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>) {
+// CHECK-PROP: gpu.yield %{{.*}} : vector<64xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]] : vector<2xf32> into f32
// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]]
@@ -698,10 +698,10 @@ func.func @vector_reduction(%laneid: index, %m0: memref<4x2x32xf32>, %m1: memref
// CHECK-PROP: %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
// CHECK-PROP: return %[[a4]] : f32
func.func @vector_reduction_large(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<64xf32>)
%1 = vector.reduction <add>, %0 : vector<64xf32> into f32
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -716,8 +716,8 @@ func.func @vector_reduction_large(%laneid: index) -> (f32) {
// CHECK-PROP-DAG: %[[c8:.*]] = arith.constant 8 : i32
// CHECK-PROP-DAG: %[[c16:.*]] = arith.constant 16 : i32
// CHECK-PROP-DAG: %[[c32:.*]] = arith.constant 32 : i32
-// CHECK-PROP: %[[warp_op:.*]]:2 = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
-// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
+// CHECK-PROP: %[[warp_op:.*]]:2 = gpu.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<2xf32>, f32) {
+// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<64xf32>, f32
// CHECK-PROP: }
// CHECK-PROP: %[[a:.*]] = vector.reduction <add>, %[[warp_op]]#0 : vector<2xf32> into f32
// CHECK-PROP: %[[r0:.*]], %{{.*}} = gpu.shuffle xor %[[a]], %[[c1]], %[[c32]]
@@ -733,11 +733,11 @@ func.func @vector_reduction_large(%laneid: index) -> (f32) {
// CHECK-PROP: %[[a5:.*]] = arith.addf %[[a4]], %[[warp_op]]#1
// CHECK-PROP: return %[[a5]] : f32
func.func @vector_reduction_acc(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<64xf32>)
%1 = "some_def"() : () -> (f32)
%2 = vector.reduction <add>, %0, %1 : vector<64xf32> into f32
- vector.yield %2 : f32
+ gpu.yield %2 : f32
}
return %r : f32
}
@@ -746,15 +746,15 @@ func.func @vector_reduction_acc(%laneid: index) -> (f32) {
// CHECK-PROP-LABEL: func @warp_duplicate_yield(
func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>) {
- // CHECK-PROP: %{{.*}}:2 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
+ // CHECK-PROP: %{{.*}}:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xf32>)
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>, vector<1xf32>) {
%2 = "some_def"() : () -> (vector<32xf32>)
%3 = "some_def"() : () -> (vector<32xf32>)
%4 = arith.addf %2, %3 : vector<32xf32>
%5 = arith.addf %2, %2 : vector<32xf32>
// CHECK-PROP-NOT: arith.addf
-// CHECK-PROP: vector.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
- vector.yield %4, %5 : vector<32xf32>, vector<32xf32>
+// CHECK-PROP: gpu.yield %{{.*}}, %{{.*}} : vector<32xf32>, vector<32xf32>
+ gpu.yield %4, %5 : vector<32xf32>, vector<32xf32>
}
return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}
@@ -765,9 +765,9 @@ func.func @warp_duplicate_yield(%laneid: index) -> (vector<1xf32>, vector<1xf32>
// CHECK-PROP: %[[C:.*]] = arith.constant dense<2.000000e+00> : vector<1xf32>
// CHECK-PROP: return %[[C]] : vector<1xf32>
func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
%cst = arith.constant dense<2.0> : vector<32xf32>
- vector.yield %cst : vector<32xf32>
+ gpu.yield %cst : vector<32xf32>
}
return %r : vector<1xf32>
}
@@ -779,18 +779,18 @@ func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
// CHECK-PROP-LABEL: func.func @vector_extract_1d(
// CHECK-PROP-DAG: %[[C5_I32:.*]] = arith.constant 5 : i32
// CHECK-PROP-DAG: %[[C1:.*]] = arith.constant 1 : index
-// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
+// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<64xf32>
-// CHECK-PROP: vector.yield %[[V]] : vector<64xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<64xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32>
// CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[E]], %[[C5_I32]]
// CHECK-PROP: return %[[SHUFFLED]] : f32
func.func @vector_extract_1d(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<64xf32>)
%1 = vector.extract %0[9] : f32 from vector<64xf32>
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -798,17 +798,17 @@ func.func @vector_extract_1d(%laneid: index) -> (f32) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extract_2d(
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x3xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<3xf32> from vector<5x3xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
%0 = "some_def"() : () -> (vector<5x96xf32>)
%1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32>
- vector.yield %1 : vector<96xf32>
+ gpu.yield %1 : vector<96xf32>
}
return %r : vector<3xf32>
}
@@ -816,17 +816,17 @@ func.func @vector_extract_2d(%laneid: index) -> (vector<3xf32>) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast_scalar(
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][1, 2] : f32 from vector<5x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<5x96xf32>)
%1 = vector.extract %0[1, 2] : f32 from vector<5x96xf32>
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -834,17 +834,17 @@ func.func @vector_extract_2d_broadcast_scalar(%laneid: index) -> (f32) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extract_2d_broadcast(
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<5x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[V]] : vector<5x96xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<5x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<96xf32> from vector<5x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
%0 = "some_def"() : () -> (vector<5x96xf32>)
%1 = vector.extract %0[2] : vector<96xf32> from vector<5x96xf32>
- vector.yield %1 : vector<96xf32>
+ gpu.yield %1 : vector<96xf32>
}
return %r : vector<96xf32>
}
@@ -852,17 +852,17 @@ func.func @vector_extract_2d_broadcast(%laneid: index) -> (vector<96xf32>) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extract_3d(
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<8x4x96xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[V]] : vector<8x128x96xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<8x128x96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[W]][2] : vector<4x96xf32> from vector<8x4x96xf32>
// CHECK-PROP: return %[[E]]
func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
%0 = "some_def"() : () -> (vector<8x128x96xf32>)
%1 = vector.extract %0[2] : vector<128x96xf32> from vector<8x128x96xf32>
- vector.yield %1 : vector<128x96xf32>
+ gpu.yield %1 : vector<128x96xf32>
}
return %r : vector<4x96xf32>
}
@@ -870,17 +870,17 @@ func.func @vector_extract_3d(%laneid: index) -> (vector<4x96xf32>) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extractelement_0d(
-// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
+// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<f32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<f32>
-// CHECK-PROP: vector.yield %[[V]] : vector<f32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<f32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][] : f32 from vector<f32>
// CHECK-PROP: return %[[E]] : f32
func.func @vector_extractelement_0d(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<f32>)
%1 = vector.extractelement %0[] : vector<f32>
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -888,18 +888,18 @@ func.func @vector_extractelement_0d(%laneid: index) -> (f32) {
// -----
// CHECK-PROP-LABEL: func.func @vector_extractelement_1element(
-// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
+// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"() : () -> vector<1xf32>
-// CHECK-PROP: vector.yield %[[V]] : vector<1xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<1xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[E:.*]] = vector.extract %[[R]][0] : f32 from vector<1xf32>
// CHECK-PROP: return %[[E]] : f32
func.func @vector_extractelement_1element(%laneid: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<1xf32>)
%c0 = arith.constant 0 : index
%1 = vector.extractelement %0[%c0 : index] : vector<1xf32>
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -911,9 +911,9 @@ func.func @vector_extractelement_1element(%laneid: index) -> (f32) {
// CHECK-PROP-LABEL: func.func @vector_extractelement_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
// CHECK-PROP-DAG: %[[C32:.*]] = arith.constant 32 : i32
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) {
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<3xf32>) {
// CHECK-PROP: %[[V:.*]] = "some_def"
-// CHECK-PROP: vector.yield %[[V]] : vector<96xf32>
+// CHECK-PROP: gpu.yield %[[V]] : vector<96xf32>
// CHECK-PROP: }
// CHECK-PROP: %[[FROM_LANE:.*]] = affine.apply #[[$map]]()[%[[POS]]]
// CHECK-PROP: %[[DISTR_POS:.*]] = affine.apply #[[$map1]]()[%[[POS]]]
@@ -922,10 +922,10 @@ func.func @vector_extractelement_1element(%laneid: index) -> (f32) {
// CHECK-PROP: %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle idx %[[EXTRACTED]], %[[FROM_LANE_I32]], %[[C32]] : f32
// CHECK-PROP: return %[[SHUFFLED]]
func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
%0 = "some_def"() : () -> (vector<96xf32>)
%1 = vector.extractelement %0[%pos : index] : vector<96xf32>
- vector.yield %1 : f32
+ gpu.yield %1 : f32
}
return %r : f32
}
@@ -935,16 +935,16 @@ func.func @vector_extractelement_1d(%laneid: index, %pos: index) -> (f32) {
// Index-typed values cannot be shuffled at the moment.
// CHECK-PROP-LABEL: func.func @vector_extractelement_1d_index(
-// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (index) {
+// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (index) {
// CHECK-PROP: "some_def"
// CHECK-PROP: vector.extract
-// CHECK-PROP: vector.yield {{.*}} : index
+// CHECK-PROP: gpu.yield {{.*}} : index
// CHECK-PROP: }
func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (index) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (index) {
%0 = "some_def"() : () -> (vector<96xindex>)
%1 = vector.extractelement %0[%pos : index] : vector<96xindex>
- vector.yield %1 : index
+ gpu.yield %1 : index
}
return %r : index
}
@@ -956,14 +956,14 @@ func.func @vector_extractelement_1d_index(%laneid: index, %pos: index) -> (index
func.func @lane_dependent_warp_propagate_read(
%laneid: index, %src: memref<1x1024xf32>, %dest: memref<1x1024xf32>) {
// CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
- // CHECK-PROP-NOT: vector.warp_execute_on_lane_0
+ // CHECK-PROP-NOT: gpu.warp_execute_on_lane_0
// CHECK-PROP-DAG: %[[R0:.*]] = vector.transfer_read %arg1[%[[C0]], %[[ID]]], %{{.*}} : memref<1x1024xf32>, vector<1x1xf32>
// CHECK-PROP: vector.transfer_write %[[R0]], {{.*}} : vector<1x1xf32>, memref<1x1024xf32>
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x1xf32>) {
%2 = vector.transfer_read %src[%c0, %c0], %cst : memref<1x1024xf32>, vector<1x32xf32>
- vector.yield %2 : vector<1x32xf32>
+ gpu.yield %2 : vector<1x32xf32>
}
vector.transfer_write %r, %dest[%c0, %laneid] : vector<1x1xf32>, memref<1x1024xf32>
return
@@ -974,9 +974,9 @@ func.func @lane_dependent_warp_propagate_read(
func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) -> vector<1x1x4xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<1x1x4xf32>) {
%2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32>
- vector.yield %2 : vector<32x4x32xf32>
+ gpu.yield %2 : vector<32x4x32xf32>
}
return %r : vector<1x1x4xf32>
}
@@ -997,9 +997,9 @@ func.func @warp_propagate_read_3d(%laneid: index, %src: memref<32x4x32xf32>) ->
func.func @warp_propagate_read_broadcast(%laneid: index, %src: memref<32x1xf32>) -> vector<1x4xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[512] -> (vector<1x4xf32>) {
%2 = vector.transfer_read %src[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = affine_map<(d0, d1) -> (d0, 0)>} : memref<32x1xf32>, vector<32x64xf32>
- vector.yield %2 : vector<32x64xf32>
+ gpu.yield %2 : vector<32x64xf32>
}
return %r : vector<1x4xf32>
}
@@ -1020,14 +1020,14 @@ func.func @dont_duplicate_read(
%laneid: index, %src: memref<1024xf32>) -> vector<1xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
-// CHECK-PROP: vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
+// CHECK-PROP: gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>) {
// CHECK-PROP-NEXT: vector.transfer_read
// CHECK-PROP-NEXT: "blocking_use"
-// CHECK-PROP-NEXT: vector.yield
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+// CHECK-PROP-NEXT: gpu.yield
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
%2 = vector.transfer_read %src[%c0], %cst : memref<1024xf32>, vector<32xf32>
"blocking_use"(%2) : (vector<32xf32>) -> ()
- vector.yield %2 : vector<32xf32>
+ gpu.yield %2 : vector<32xf32>
}
return %r : vector<1xf32>
}
@@ -1038,16 +1038,16 @@ func.func @dont_duplicate_read(
func.func @dedup(%laneid: index, %v0: vector<4xf32>, %v1: vector<4xf32>)
-> (vector<1xf32>, vector<1xf32>) {
- // CHECK-PROP: %[[SINGLE_RES:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) {
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
+ // CHECK-PROP: %[[SINGLE_RES:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
args(%v0, %v1 : vector<4xf32>, vector<4xf32>) -> (vector<1xf32>, vector<1xf32>) {
^bb0(%arg0: vector<128xf32>, %arg1: vector<128xf32>):
// CHECK-PROP: %[[SINGLE_VAL:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>) -> vector<32xf32>
%2 = "some_def"(%arg0) : (vector<128xf32>) -> vector<32xf32>
- // CHECK-PROP: vector.yield %[[SINGLE_VAL]] : vector<32xf32>
- vector.yield %2, %2 : vector<32xf32>, vector<32xf32>
+ // CHECK-PROP: gpu.yield %[[SINGLE_VAL]] : vector<32xf32>
+ gpu.yield %2, %2 : vector<32xf32>, vector<32xf32>
}
// CHECK-PROP: return %[[SINGLE_RES]], %[[SINGLE_RES]] : vector<1xf32>, vector<1xf32>
@@ -1062,7 +1062,7 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
// CHECK-SCF-IF-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-SCF-IF: scf.if{{.*}}{
- %r:4 = vector.warp_execute_on_lane_0(%laneid)[32]
+ %r:4 = gpu.warp_execute_on_lane_0(%laneid)[32]
args(%s0, %v0, %v1, %v2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) -> (f32, vector<f32>, vector<1xf32>, vector<1x1xf32>) {
^bb0(%bs0: f32, %bv0: vector<f32>, %bv1: vector<1xf32>, %bv2: vector<1x1xf32>):
@@ -1084,8 +1084,8 @@ func.func @warp_execute_has_broadcast_semantics(%laneid: index, %s0: f32, %v0: v
%rv1 = "some_def_1"(%bv1) : (vector<1xf32>) -> vector<1xf32>
%rv2 = "some_def_1"(%bv2) : (vector<1x1xf32>) -> vector<1x1xf32>
- // CHECK-SCF-IF-NOT: vector.yield
- vector.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
+ // CHECK-SCF-IF-NOT: gpu.yield
+ gpu.yield %rs0, %rv0, %rv1, %rv2 : f32, vector<f32>, vector<1xf32>, vector<1x1xf32>
}
// CHECK-SCF-IF: gpu.barrier
@@ -1113,7 +1113,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
// CHECK-SCF-IF: gpu.barrier
// CHECK-SCF-IF: scf.if{{.*}}{
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32]
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32]
args(%v0, %v1 : vector<1x64x1xf32>, vector<1x2x128xf32>) -> (vector<1x64x1xf32>, vector<1x2x128xf32>) {
^bb0(%arg0: vector<32x64x1xf32>, %arg1: vector<1x64x128xf32>):
@@ -1127,8 +1127,8 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
%r0 = "some_def_0"(%arg0) : (vector<32x64x1xf32>) -> vector<32x64x1xf32>
%r1 = "some_def_1"(%arg1) : (vector<1x64x128xf32>) -> vector<1x64x128xf32>
- // CHECK-SCF-IF-NOT: vector.yield
- vector.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
+ // CHECK-SCF-IF-NOT: gpu.yield
+ gpu.yield %r0, %r1 : vector<32x64x1xf32>, vector<1x64x128xf32>
}
// CHECK-SCF-IF: gpu.barrier
@@ -1145,7 +1145,7 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
// CHECK-PROP: #[[$MAP1:.*]] = affine_map<()[s0] -> (s0 mod 3)>
// CHECK-PROP-LABEL: func @vector_insertelement_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
// CHECK-PROP: %[[INSERTING_LANE:.*]] = affine.apply #[[$MAP]]()[%[[POS]]]
// CHECK-PROP: %[[INSERTING_POS:.*]] = affine.apply #[[$MAP1]]()[%[[POS]]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[INSERTING_LANE]] : index
@@ -1157,11 +1157,11 @@ func.func @warp_execute_nd_distribute(%laneid: index, %v0: vector<1x64x1xf32>, %
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
%0 = "some_def"() : () -> (vector<96xf32>)
%f = "another_def"() : () -> (f32)
%1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
- vector.yield %1 : vector<96xf32>
+ gpu.yield %1 : vector<96xf32>
}
return %r : vector<3xf32>
}
@@ -1170,17 +1170,17 @@ func.func @vector_insertelement_1d(%laneid: index, %pos: index) -> (vector<3xf32
// CHECK-PROP-LABEL: func @vector_insertelement_1d_broadcast(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index, %[[POS:.*]]: index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]]
+// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [%[[POS]]] : f32 into vector<96xf32>
func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (vector<96xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<96xf32>) {
%0 = "some_def"() : () -> (vector<96xf32>)
%f = "another_def"() : () -> (f32)
%1 = vector.insertelement %f, %0[%pos : index] : vector<96xf32>
- vector.yield %1 : vector<96xf32>
+ gpu.yield %1 : vector<96xf32>
}
return %r : vector<96xf32>
}
@@ -1188,17 +1188,17 @@ func.func @vector_insertelement_1d_broadcast(%laneid: index, %pos: index) -> (ve
// -----
// CHECK-PROP-LABEL: func @vector_insertelement_0d(
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<f32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]]
+// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: vector.insert %[[W]]#1, %[[W]]#0 [] : f32 into vector<f32>
func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<f32>) {
%0 = "some_def"() : () -> (vector<f32>)
%f = "another_def"() : () -> (f32)
%1 = vector.insertelement %f, %0[] : vector<f32>
- vector.yield %1 : vector<f32>
+ gpu.yield %1 : vector<f32>
}
return %r : vector<f32>
}
@@ -1208,10 +1208,10 @@ func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) {
// CHECK-PROP-LABEL: func @vector_insert_1d(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP-DAG: %[[C26:.*]] = arith.constant 26 : index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, f32)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VEC]], %[[VAL]]
+// CHECK-PROP: gpu.yield %[[VEC]], %[[VAL]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C26]]
// CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<3xf32>) {
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#1, %[[W]]#0 [1]
@@ -1221,11 +1221,11 @@ func.func @vector_insertelement_0d(%laneid: index) -> (vector<f32>) {
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<3xf32>) {
%0 = "some_def"() : () -> (vector<96xf32>)
%f = "another_def"() : () -> (f32)
%1 = vector.insert %f, %0[76] : f32 into vector<96xf32>
- vector.yield %1 : vector<96xf32>
+ gpu.yield %1 : vector<96xf32>
}
return %r : vector<3xf32>
}
@@ -1234,18 +1234,18 @@ func.func @vector_insert_1d(%laneid: index) -> (vector<3xf32>) {
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_src(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<3xf32>, vector<4x3xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]]
+// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<3xf32> into vector<4x3xf32>
// CHECK-PROP: return %[[INSERT]]
func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x3xf32>) {
%0 = "some_def"() : () -> (vector<4x96xf32>)
%s = "another_def"() : () -> (vector<96xf32>)
%1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32>
- vector.yield %1 : vector<4x96xf32>
+ gpu.yield %1 : vector<4x96xf32>
}
return %r : vector<4x3xf32>
}
@@ -1255,10 +1255,10 @@ func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) {
// CHECK-PROP-LABEL: func @vector_insert_2d_distr_pos(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
// CHECK-PROP: %[[C19:.*]] = arith.constant 19 : index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]]
+// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[SHOULD_INSERT:.*]] = arith.cmpi eq, %[[LANEID]], %[[C19]]
// CHECK-PROP: %[[R:.*]] = scf.if %[[SHOULD_INSERT]] -> (vector<4x96xf32>) {
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [3] : vector<96xf32> into vector<4x96xf32>
@@ -1268,11 +1268,11 @@ func.func @vector_insert_2d_distr_src(%laneid: index) -> (vector<4x3xf32>) {
// CHECK-PROP: }
// CHECK-PROP: return %[[R]]
func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
%0 = "some_def"() : () -> (vector<128x96xf32>)
%s = "another_def"() : () -> (vector<96xf32>)
%1 = vector.insert %s, %0[79] : vector<96xf32> into vector<128x96xf32>
- vector.yield %1 : vector<128x96xf32>
+ gpu.yield %1 : vector<128x96xf32>
}
return %r : vector<4x96xf32>
}
@@ -1281,18 +1281,18 @@ func.func @vector_insert_2d_distr_pos(%laneid: index) -> (vector<4x96xf32>) {
// CHECK-PROP-LABEL: func @vector_insert_2d_broadcast(
// CHECK-PROP-SAME: %[[LANEID:.*]]: index
-// CHECK-PROP: %[[W:.*]]:2 = vector.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
+// CHECK-PROP: %[[W:.*]]:2 = gpu.warp_execute_on_lane_0{{.*}} -> (vector<96xf32>, vector<4x96xf32>)
// CHECK-PROP: %[[VEC:.*]] = "some_def"
// CHECK-PROP: %[[VAL:.*]] = "another_def"
-// CHECK-PROP: vector.yield %[[VAL]], %[[VEC]]
+// CHECK-PROP: gpu.yield %[[VAL]], %[[VEC]]
// CHECK-PROP: %[[INSERT:.*]] = vector.insert %[[W]]#0, %[[W]]#1 [2] : vector<96xf32> into vector<4x96xf32>
// CHECK-PROP: return %[[INSERT]]
func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<4x96xf32>) {
%0 = "some_def"() : () -> (vector<4x96xf32>)
%s = "another_def"() : () -> (vector<96xf32>)
%1 = vector.insert %s, %0[2] : vector<96xf32> into vector<4x96xf32>
- vector.yield %1 : vector<4x96xf32>
+ gpu.yield %1 : vector<4x96xf32>
}
return %r : vector<4x96xf32>
}
@@ -1310,12 +1310,12 @@ func.func @vector_insert_2d_broadcast(%laneid: index) -> (vector<4x96xf32>) {
// CHECK-PROP-SAME: %[[AR2:[^ :]*]]: memref<1x4x1024xf32>)
// CHECK-PROP-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-PROP-DAG: %[[THREADID:.*]] = gpu.thread_id x
-// CHECK-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
+// CHECK-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%[[THREADID]])[32] args(%[[IN2]]
// CHECK-PROP: %[[GATHER:.*]] = vector.gather %[[AR1]][{{.*}}]
// CHECK-PROP: %[[EXTRACT:.*]] = vector.extract %[[GATHER]][0] : vector<64xi32> from vector<1x64xi32>
// CHECK-PROP: %[[CAST:.*]] = arith.index_cast %[[EXTRACT]] : vector<64xi32> to vector<64xindex>
// CHECK-PROP: %[[EXTRACTELT:.*]] = vector.extract %[[CAST]][{{.*}}] : index from vector<64xindex>
-// CHECK-PROP: vector.yield %[[EXTRACTELT]] : index
+// CHECK-PROP: gpu.yield %[[EXTRACTELT]] : index
// CHECK-PROP: %[[APPLY:.*]] = affine.apply #[[$MAP]]()[%[[THREADID]]]
// CHECK-PROP: %[[TRANSFERREAD:.*]] = vector.transfer_read %[[AR2]][%[[C0]], %[[W]], %[[APPLY]]],
// CHECK-PROP: return %[[TRANSFERREAD]]
@@ -1329,14 +1329,14 @@ func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1
%cst_2 = arith.constant dense<0> : vector<64xindex>
%cst_6 = arith.constant 0.000000e+00 : f32
- %18 = vector.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) {
+ %18 = gpu.warp_execute_on_lane_0(%0)[32] args(%in2 : vector<1x2xindex>) -> (vector<2xf32>) {
^bb0(%arg4: vector<1x64xindex>):
%28 = vector.gather %ar1[%c0, %c0, %c0] [%arg4], %cst_0, %cst : memref<1x4x2xi32>, vector<1x64xindex>, vector<1x64xi1>, vector<1x64xi32> into vector<1x64xi32>
%29 = vector.extract %28[0] : vector<64xi32> from vector<1x64xi32>
%30 = arith.index_cast %29 : vector<64xi32> to vector<64xindex>
%36 = vector.extractelement %30[%c0_i32 : index] : vector<64xindex>
%37 = vector.transfer_read %ar2[%c0, %36, %c0], %cst_6 {in_bounds = [true]} : memref<1x4x1024xf32>, vector<64xf32>
- vector.yield %37 : vector<64xf32>
+ gpu.yield %37 : vector<64xf32>
}
return %18 : vector<2xf32>
}
@@ -1347,16 +1347,16 @@ func.func @transfer_read_prop_operands(%in2: vector<1x2xindex>, %ar1 : memref<1
// same value.
// CHECK-PROP-LABEL: func @dont_fold_vector_broadcast(
-// CHECK-PROP: %[[r:.*]] = vector.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
+// CHECK-PROP: %[[r:.*]] = gpu.warp_execute_on_lane_0{{.*}} -> (vector<1x2xf32>)
// CHECK-PROP: %[[some_def:.*]] = "some_def"
// CHECK-PROP: %[[broadcast:.*]] = vector.broadcast %[[some_def]] : vector<64xf32> to vector<1x64xf32>
-// CHECK-PROP: vector.yield %[[broadcast]] : vector<1x64xf32>
+// CHECK-PROP: gpu.yield %[[broadcast]] : vector<1x64xf32>
// CHECK-PROP: vector.print %[[r]] : vector<1x2xf32>
func.func @dont_fold_vector_broadcast(%laneid: index) {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2xf32>) {
%0 = "some_def"() : () -> (vector<64xf32>)
%1 = vector.broadcast %0 : vector<64xf32> to vector<1x64xf32>
- vector.yield %1 : vector<1x64xf32>
+ gpu.yield %1 : vector<1x64xf32>
}
vector.print %r : vector<1x2xf32>
return
@@ -1367,10 +1367,10 @@ func.func @dont_fold_vector_broadcast(%laneid: index) {
func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>) -> vector<4xf32> {
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[1024] -> (vector<4xf32>) {
%2 = vector.transfer_read %src[%c0, %c0, %c0], %cst : memref<32x4x32xf32>, vector<32x4x32xf32>
%3 = vector.shape_cast %2 : vector<32x4x32xf32> to vector<4096xf32>
- vector.yield %3 : vector<4096xf32>
+ gpu.yield %3 : vector<4096xf32>
}
return %r : vector<4xf32>
}
@@ -1384,9 +1384,9 @@ func.func @warp_propagate_shape_cast(%laneid: index, %src: memref<32x4x32xf32>)
func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index) -> vector<1xf32> {
%f0 = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
%1 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
- vector.yield %1 : vector<1xf32>
+ gpu.yield %1 : vector<1xf32>
}
return %r : vector<1xf32>
}
@@ -1400,31 +1400,31 @@ func.func @warp_propagate_uniform_transfer_read(%laneid: index, %src: memref<409
func.func @warp_propagate_multi_transfer_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>, vector<1xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>, vector<1xf32>) {
%0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
"some_use"(%0) : (vector<1xf32>) -> ()
%1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<1xf32>
- vector.yield %0, %1 : vector<1xf32>, vector<1xf32>
+ gpu.yield %0, %1 : vector<1xf32>, vector<1xf32>
}
return %r#0, %r#1 : vector<1xf32>, vector<1xf32>
}
// CHECK-PROP-LABEL: func.func @warp_propagate_multi_transfer_read
-// CHECK-PROP: vector.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
+// CHECK-PROP: gpu.warp_execute_on_lane_0{{.*}} -> (vector<1xf32>)
// CHECK-PROP: %[[INNER_READ:.+]] = vector.transfer_read
// CHECK-PROP: "some_use"(%[[INNER_READ]])
-// CHECK-PROP: vector.yield %[[INNER_READ]] : vector<1xf32>
+// CHECK-PROP: gpu.yield %[[INNER_READ]] : vector<1xf32>
// CHECK-PROP: vector.transfer_read
// -----
func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096xf32>, %index: index, %index1: index) -> (vector<1xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
- %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<1xf32>) {
%0 = vector.transfer_read %src[%index], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32>
%1 = vector.transfer_read %src[%index1], %f0 {in_bounds = [true]} : memref<4096xf32>, vector<64xf32>
%max = arith.maximumf %0, %1 : vector<64xf32>
- vector.yield %max : vector<64xf32>
+ gpu.yield %max : vector<64xf32>
}
return %r : vector<1xf32>
}
@@ -1437,25 +1437,25 @@ func.func @warp_propagate_dead_user_multi_read(%laneid: index, %src: memref<4096
func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>) {
%c0 = arith.constant 0 : index
- vector.warp_execute_on_lane_0(%laneid)[32] -> () {
+ gpu.warp_execute_on_lane_0(%laneid)[32] -> () {
%mask = "mask_def_0"() : () -> (vector<4096xi1>)
%mask2 = "mask_def_1"() : () -> (vector<32xi1>)
%0 = "some_def_0"() : () -> (vector<4096xf32>)
%1 = "some_def_1"() : () -> (vector<32xf32>)
vector.transfer_write %0, %dest[%c0], %mask : vector<4096xf32>, memref<4096xf32>
vector.transfer_write %1, %dest[%c0], %mask2 : vector<32xf32>, memref<4096xf32>
- vector.yield
+ gpu.yield
}
return
}
// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_masked_write(
-// CHECK-DIST-AND-PROP: %[[W:.*]]:4 = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) {
+// CHECK-DIST-AND-PROP: %[[W:.*]]:4 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1xf32>, vector<1xi1>, vector<128xf32>, vector<128xi1>) {
// CHECK-DIST-AND-PROP: %[[M0:.*]] = "mask_def_0"
// CHECK-DIST-AND-PROP: %[[M1:.*]] = "mask_def_1"
// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def_0"
// CHECK-DIST-AND-PROP: %[[V1:.*]] = "some_def_1"
-// CHECK-DIST-AND-PROP: vector.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]]
+// CHECK-DIST-AND-PROP: gpu.yield %[[V1]], %[[M1]], %[[V0]], %[[M0]]
// CHECK-DIST-AND-PROP-SAME: vector<32xf32>, vector<32xi1>, vector<4096xf32>, vector<4096xi1>
// CHECK-DIST-AND-PROP: }
// CHECK-DIST-AND-PROP: vector.transfer_write %[[W]]#2, {{.*}}, %[[W]]#3 {in_bounds = [true]} : vector<128xf32>, memref<4096xf32>
@@ -1466,12 +1466,12 @@ func.func @warp_propagate_masked_write(%laneid: index, %dest: memref<4096xf32>)
func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> (vector<2xf32>, vector<2x2xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2x2xf32>) {
%mask = "mask_def_0"() : () -> (vector<128xi1>)
%0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
%mask2 = "mask_def_1"() : () -> (vector<128x2xi1>)
%1 = vector.transfer_read %src[%c0, %index], %f0, %mask2 {in_bounds = [true, true]} : memref<4096x4096xf32>, vector<128x2xf32>
- vector.yield %0, %1 : vector<128xf32>, vector<128x2xf32>
+ gpu.yield %0, %1 : vector<128xf32>, vector<128x2xf32>
}
return %r#0, %r#1 : vector<2xf32>, vector<2x2xf32>
}
@@ -1481,10 +1481,10 @@ func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096
// CHECK-PROP-LABEL: func.func @warp_propagate_masked_transfer_read
// CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index
// CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-PROP: %[[R:.*]]:2 = vector.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) {
+// CHECK-PROP: %[[R:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>, vector<2x2xi1>) {
// CHECK-PROP: %[[M0:.*]] = "mask_def_0"
// CHECK-PROP: %[[M1:.*]] = "mask_def_1"
-// CHECK-PROP: vector.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1>
+// CHECK-PROP: gpu.yield %[[M0]], %[[M1]] : vector<128xi1>, vector<128x2xi1>
// CHECK-PROP: }
// CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG0]]]
// CHECK-PROP: vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[ARG2]]], {{.*}}, %[[R]]#1 {{.*}} vector<2x2xf32>
@@ -1496,10 +1496,10 @@ func.func @warp_propagate_masked_transfer_read(%laneid: index, %src: memref<4096
func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %src: memref<4096x4096xf32>, %index: index) -> vector<2xf32> {
%f0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
- %r = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>) {
%mask = "mask_def_0"() : () -> (vector<128xi1>)
%0 = vector.transfer_read %src[%index, %c0], %f0, %mask {in_bounds = [true], permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<4096x4096xf32>, vector<128xf32>
- vector.yield %0 : vector<128xf32>
+ gpu.yield %0 : vector<128xf32>
}
return %r : vector<2xf32>
}
@@ -1509,9 +1509,9 @@ func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %s
// CHECK-PROP-LABEL: func.func @warp_propagate_nontrivial_map_masked_transfer_read
// CHECK-PROP-SAME: %[[ARG0:.+]]: index, {{.*}}, %[[ARG2:.+]]: index
// CHECK-PROP: %[[C0:.*]] = arith.constant 0 : index
-// CHECK-PROP: %[[R:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) {
+// CHECK-PROP: %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[64] -> (vector<2xi1>) {
// CHECK-PROP: %[[M0:.*]] = "mask_def_0"
-// CHECK-PROP: vector.yield %[[M0]] : vector<128xi1>
+// CHECK-PROP: gpu.yield %[[M0]] : vector<128xi1>
// CHECK-PROP: }
// CHECK-PROP: %[[DIST_READ_IDX0:.+]] = affine.apply #[[$MAP0]]()[%[[ARG2]], %[[ARG0]]]
// CHECK-PROP: vector.transfer_read {{.*}}[%[[DIST_READ_IDX0]], %[[C0]]], {{.*}}, %[[R]]
@@ -1522,11 +1522,11 @@ func.func @warp_propagate_nontrivial_map_masked_transfer_read(%laneid: index, %s
func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: index, %src: memref<4096x4096xf32>, %index: index, %index2: index, %mask_ub: index) -> (vector<2xf32>, vector<2xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[64] -> (vector<2xf32>, vector<2xf32>) {
%mask = vector.create_mask %mask_ub: vector<128xi1>
%0 = vector.transfer_read %src[%c0, %index], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
%1 = vector.transfer_read %src[%c0, %index2], %f0, %mask {in_bounds = [true]} : memref<4096x4096xf32>, vector<128xf32>
- vector.yield %0, %1 : vector<128xf32>, vector<128xf32>
+ gpu.yield %0, %1 : vector<128xf32>, vector<128xf32>
}
return %r#0, %r#1 : vector<2xf32>, vector<2xf32>
}
@@ -1542,12 +1542,12 @@ func.func @warp_propagate_masked_transfer_read_shared_mask(%laneid: index, %src:
func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref<128xf32>, %f1: f32) -> (vector<2xf32>, vector<4xf32>) {
%f0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
- %r:2 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) {
+ %r:2 = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<2xf32>, vector<4xf32>) {
%cst = arith.constant dense<2.0> : vector<128xf32>
%0 = vector.transfer_read %buffer[%c0], %f0 {in_bounds = [true]} : memref<128xf32>, vector<128xf32>
vector.transfer_write %cst, %buffer[%c0] : vector<128xf32>, memref<128xf32>
%1 = vector.broadcast %f1 : f32 to vector<64xf32>
- vector.yield %1, %0 : vector<64xf32>, vector<128xf32>
+ gpu.yield %1, %0 : vector<64xf32>, vector<128xf32>
}
return %r#0, %r#1 : vector<2xf32>, vector<4xf32>
}
@@ -1561,9 +1561,9 @@ func.func @warp_propagate_unconnected_read_write(%laneid: index, %buffer: memref
// -----
func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1> {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xi1>) {
%1 = vector.create_mask %m0 : vector<32xi1>
- vector.yield %1 : vector<32xi1>
+ gpu.yield %1 : vector<32xi1>
}
return %r : vector<1xi1>
}
@@ -1577,9 +1577,9 @@ func.func @warp_propagate_create_mask(%laneid: index, %m0: index) -> vector<1xi1
// -----
func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1: index, %m2: index) -> vector<1x2x4xi1> {
- %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) {
+ %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1x2x4xi1>) {
%1 = vector.create_mask %m0, %m1, %m2 : vector<16x4x4xi1>
- vector.yield %1 : vector<16x4x4xi1>
+ gpu.yield %1 : vector<16x4x4xi1>
}
return %r : vector<1x2x4xi1>
}
@@ -1596,10 +1596,10 @@ func.func @warp_propagate_multi_dim_create_mask(%laneid: index, %m0: index, %m1:
func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
%c0 = arith.constant 0 : index
- vector.warp_execute_on_lane_0(%laneid)[32] -> () {
+ gpu.warp_execute_on_lane_0(%laneid)[32] -> () {
%0 = "some_def"() : () -> (vector<4x1024xf32>)
vector.transfer_write %0, %dest[%c0, %c0] : vector<4x1024xf32>, memref<4x1024xf32>
- vector.yield
+ gpu.yield
}
return
}
@@ -1607,9 +1607,9 @@ func.func @warp_propagate_nd_write(%laneid: index, %dest: memref<4x1024xf32>) {
// CHECK-DIST-AND-PROP: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 128)>
// CHECK-DIST-AND-PROP-LABEL: func.func @warp_propagate_nd_write(
-// CHECK-DIST-AND-PROP: %[[W:.*]] = vector.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
+// CHECK-DIST-AND-PROP: %[[W:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<1x128xf32>) {
// CHECK-DIST-AND-PROP: %[[V0:.*]] = "some_def"
-// CHECK-DIST-AND-PROP: vector.yield %[[V0]]
+// CHECK-DIST-AND-PROP: gpu.yield %[[V0]]
// CHECK-DIST-AND-PROP-SAME: vector<4x1024xf32>
// CHECK-DIST-AND-PROP: }
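Taken together, the test updates above are a pure spelling change. For reference, a minimal sketch of the op after the move (the %laneid name and the "some_def" op are illustrative placeholders, not taken from any one test): only the dialect prefix on the op and on its yield terminator differs from the old vector form, while the lane-id operand, the [32] warp-size attribute, and the region syntax are unchanged.

func.func @rename_sketch(%laneid: index) -> vector<1xf32> {
  // Previously spelled: vector.warp_execute_on_lane_0(%laneid)[32] ... vector.yield
  %r = gpu.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
    %v = "some_def"() : () -> (vector<32xf32>)
    gpu.yield %v : vector<32xf32>
  }
  return %r : vector<1xf32>
}
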
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
index 378e5b39415b5c..f1abf77753b871 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -20,7 +20,7 @@ func.func @gpu_func(%in: memref<1024xf32>, %out: memref<1xf32>) {
gpu.launch blocks(%arg3, %arg4, %arg5)
in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1)
threads(%arg6, %arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) {
- vector.warp_execute_on_lane_0(%arg6)[32] {
+ gpu.warp_execute_on_lane_0(%arg6)[32] {
%init = vector.transfer_read %out[%c0], %cst_0 {in_bounds = [true]} : memref<1xf32>, vector<1xf32>
%13 = scf.for %arg0 = %c0 to %c1024 step %c32 iter_args(%arg1 = %init) -> (vector<1xf32>) {
%20 = vector.transfer_read %in[%arg0], %cst_0 {in_bounds = [true]} : memref<1024xf32>, vector<32xf32>
diff --git a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
index 7e9234901ffa1a..8ce24bfe3640ab 100644
--- a/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
+++ b/mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-warp-distribute.mlir
@@ -38,7 +38,7 @@ func.func @gpu_func(%arg1: memref<32xf32>, %arg2: memref<32xf32>) {
gpu.launch blocks(%arg3, %arg4, %arg5)
in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1)
threads(%arg6, %arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) {
- vector.warp_execute_on_lane_0(%arg6)[32] {
+ gpu.warp_execute_on_lane_0(%arg6)[32] {
%0 = vector.transfer_read %arg1[%c0], %cst {in_bounds = [true]} : memref<32xf32>, vector<32xf32>
%1 = vector.transfer_read %arg2[%c0], %cst {in_bounds = [true]} : memref<32xf32>, vector<32xf32>
%2 = arith.addf %0, %1 : vector<32xf32>
diff --git a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
index 72aaa7dc4f8973..9d8969edfd90fe 100644
--- a/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -519,7 +519,7 @@ struct TestVectorScanLowering
/// Allocate shared memory for a single warp to test lowering of
/// WarpExecuteOnLane0Op.
static Value allocateGlobalSharedMemory(Location loc, OpBuilder &builder,
- WarpExecuteOnLane0Op warpOp,
+ gpu::WarpExecuteOnLane0Op warpOp,
Type type) {
static constexpr int64_t kSharedMemorySpace = 3;
// Compute type of shared memory buffer.
@@ -583,8 +583,9 @@ struct TestVectorDistribution
MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestVectorDistribution)
void getDependentDialects(DialectRegistry &registry) const override {
- registry.insert<scf::SCFDialect, memref::MemRefDialect, gpu::GPUDialect,
- affine::AffineDialect>();
+ registry
+ .insert<vector::VectorDialect, scf::SCFDialect, memref::MemRefDialect,
+ gpu::GPUDialect, affine::AffineDialect>();
}
StringRef getArgument() const final { return "test-vector-warp-distribute"; }
@@ -622,7 +623,7 @@ struct TestVectorDistribution
RewritePatternSet patterns(&getContext());
getOperation().walk([&](Operation *op) {
- if (auto warpOp = dyn_cast<WarpExecuteOnLane0Op>(op)) {
+ if (auto warpOp = dyn_cast<gpu::WarpExecuteOnLane0Op>(op)) {
if (hoistUniform) {
moveScalarUniformCode(warpOp);
}
@@ -677,7 +678,7 @@ struct TestVectorDistribution
WarpExecuteOnLane0LoweringOptions options;
options.warpAllocationFn = allocateGlobalSharedMemory;
options.warpSyncronizationFn = [](Location loc, OpBuilder &builder,
- WarpExecuteOnLane0Op warpOp) {
+ gpu::WarpExecuteOnLane0Op warpOp) {
builder.create<gpu::BarrierOp>(loc);
};
// Test on one pattern in isolation.
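For context on what the WarpExecuteOnLane0LoweringOptions above drive, a rough sketch of the lowered IR (illustrative only; the exact output depends on the warpAllocationFn and warpSyncronizationFn callbacks, and "some_def" stands in for the single-lane body): the region runs under an scf.if guarded on lane 0, results travel through the shared-memory buffer in address space 3 (matching kSharedMemorySpace in the test pass), and a gpu.barrier separates the write from the per-lane reads.

func.func @lowering_sketch(%laneid: index, %buffer: memref<32xf32, 3>) -> vector<1xf32> {
  %c0 = arith.constant 0 : index
  %f0 = arith.constant 0.000000e+00 : f32
  %is_lane_0 = arith.cmpi eq, %laneid, %c0 : index
  scf.if %is_lane_0 {
    // The single-lane region body runs here; its yielded value is staged
    // into the buffer produced by the warpAllocationFn callback.
    %v = "some_def"() : () -> (vector<32xf32>)
    vector.transfer_write %v, %buffer[%c0] {in_bounds = [true]} : vector<32xf32>, memref<32xf32, 3>
  }
  // warpSyncronizationFn inserts the barrier between the write and the reads.
  gpu.barrier
  // Each lane then reads back its one-element slice of the distributed result.
  %r = vector.transfer_read %buffer[%laneid], %f0 {in_bounds = [true]} : memref<32xf32, 3>, vector<1xf32>
  return %r : vector<1xf32>
}
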