[Mlir-commits] [mlir] eb2f946 - [mlir][scf] Rename ForeachThreadOp->ForallOp, PerformConcurrentlyOp->InParallelOp.

Alexander Belyaev llvmlistbot at llvm.org
Fri Feb 17 01:02:02 PST 2023


Author: Alexander Belyaev
Date: 2023-02-17T09:59:39+01:00
New Revision: eb2f946e780cc0e82260c775930fa0509885b1ea

URL: https://github.com/llvm/llvm-project/commit/eb2f946e780cc0e82260c775930fa0509885b1ea
DIFF: https://github.com/llvm/llvm-project/commit/eb2f946e780cc0e82260c775930fa0509885b1ea.diff

LOG: [mlir][scf] Rename ForeachThreadOp->ForallOp, PerformConcurrentlyOp->InParallelOp.

Differential Revision: https://reviews.llvm.org/D144242
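
For reference, a minimal before/after sketch of the rename as it appears in IR
(illustrative only; the loop, bounds and values below are placeholders rather
than code from this commit):

    // Before:
    %r = scf.foreach_thread (%i) in (%c8) shared_outs(%o = %t) -> (tensor<8xf32>) {
      // ... body producing %res : tensor<1xf32> ...
      scf.foreach_thread.perform_concurrently {
        tensor.parallel_insert_slice %res into %o[%i] [1] [1]
          : tensor<1xf32> into tensor<8xf32>
      }
    }

    // After:
    %r = scf.forall (%i) in (%c8) shared_outs(%o = %t) -> (tensor<8xf32>) {
      // ... body producing %res : tensor<1xf32> ...
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %res into %o[%i] [1] [1]
          : tensor<1xf32> into tensor<8xf32>
      }
    }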

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
    mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
    mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
    mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
    mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
    mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td
    mlir/include/mlir/Dialect/SCF/IR/SCF.h
    mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
    mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
    mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
    mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
    mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
    mlir/lib/Dialect/SCF/IR/SCF.cpp
    mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
    mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
    mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
    mlir/test/Dialect/GPU/transform-gpu-failing.mlir
    mlir/test/Dialect/GPU/transform-gpu.mlir
    mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
    mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
    mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
    mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
    mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
    mlir/test/Dialect/SCF/canonicalize.mlir
    mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir
    mlir/test/Dialect/SCF/invalid.mlir
    mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
    mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
    mlir/test/Dialect/SCF/one-shot-bufferize.mlir
    mlir/test/Dialect/SCF/ops.mlir
    mlir/test/Dialect/Tensor/canonicalize.mlir
    mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
    mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
    mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
    mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
    mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
    mlir/test/python/dialects/transform_structured_ext.py

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
index 9b6485523c1c9..91352b28fc7b7 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.h
@@ -33,40 +33,37 @@ class DialectRegistry;
 namespace transform {
 namespace gpu {
 
-/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
+/// Searches `scf.forall` ops nested under `target` and maps each such
 /// op to GPU threads. Mapping is one-to-one and the induction variables of
-/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
-/// thread_dim_apping attribute. Sibling `scf.foreach_thread` are supported in
+/// `scf.forall` are rewritten to gpu.thread_id according to the
+/// thread_dim_mapping attribute. Sibling `scf.forall` are supported in
 /// which case, the union of the number of threads is computed and may result in
-/// predication. Dynamic, `scf.foreach_thread` trip counts are currently not
+/// predication. Dynamic, `scf.forall` trip counts are currently not
 /// supported. Dynamic block dim sizes are currently not supported.
 DiagnosedSilenceableFailure mapNestedForeachToThreadsImpl(
     RewriterBase &rewriter, Operation *target,
     const SmallVectorImpl<int64_t> &blockDim,
-    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
-                      SmallVectorImpl<Value> &)>
+    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
         threadIdGenerator,
     bool syncAfterDistribute, std::optional<TransformOpInterface> transformOp,
     const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes);
 
-/// Maps the top level `scf.foreach_thread` op to GPU Thread Blocks. Mapping is
-/// one-to-one and the induction variables of `scf.foreach_thread` are rewritten
+/// Maps the top level `scf.forall` op to GPU Thread Blocks. Mapping is
+/// one-to-one and the induction variables of `scf.forall` are rewritten
 /// to gpu.block_id according to the thread_dim_mapping attribute. Dynamic,
-/// `scf.foreach_thread` trip counts are currently not supported. Dynamic block
+/// `scf.forall` trip counts are currently not supported. Dynamic block
 /// dim sizes are currently not supported.
 DiagnosedSilenceableFailure mapForeachToBlocksImpl(
-    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
-    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
-                      SmallVectorImpl<Value> &)>
+    RewriterBase &rewriter, scf::ForallOp forallOp,
+    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
         blockIdGenerator,
     SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp,
     const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes);
 
-/// Finds the top level scf::ForeachThreadOp of given target.
+/// Finds the top level scf::ForallOp of the given target.
 DiagnosedSilenceableFailure
-findTopLevelForeachThreadOp(Operation *target,
-                            scf::ForeachThreadOp &topLevelForeachThreadOp,
-                            TransformOpInterface transformOp);
+findTopLevelForallOp(Operation *target, scf::ForallOp &topLevelForallOp,
+                     TransformOpInterface transformOp);
 
 } // namespace gpu
 } // namespace transform

diff  --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
index 2ee13548c9171..76292b4683380 100644
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -22,30 +22,30 @@ def MapNestedForeachToThreads :
      TransformEachOpTrait,
      TransformOpInterface]> {
   let description = [{
-      Target the `gpu.launch op` and rewrite all `scf.foreach_thread`
+      Target the `gpu.launch` op and rewrite all `scf.forall`
       nested in it to distributed `gpu.thread_id` attribute.
 
-      The operation searches for `scf.foreach_thread` ops nested under `target`
+      The operation searches for `scf.forall` ops nested under `target`
       and maps each such op to GPU threads. Mapping is one-to-one and the
-      induction variables of `scf.foreach_thread` are rewritten to
+      induction variables of `scf.forall` are rewritten to
       `gpu.thread_id` according to the `mapping` attribute.
 
-      Sibling `scf.foreach_thread` are supported in which case, the union of
+      Sibling `scf.forall` are supported in which case, the union of
       the number of threads is computed and may result in predication.
 
-      Multiple scf.foreach_thread are supported per `gpu.launch` in which case,
+      Multiple scf.forall are supported per `gpu.launch` in which case,
       the max of all the threads is computed and taken for the global
-      `gpu.thread_id`. If necessary, `scf.foreach_thread` that do not use the
+      `gpu.thread_id`. If necessary, `scf.forall` that do not use the
       whole thread range result in predicated computations.
 
-      Dynamic `scf.foreach_thread` trip counts are currently not supported.
+      Dynamic `scf.forall` trip counts are currently not supported.
       Dynamic block dim sizes are currently not supported.
 
-      Only **bufferized** `scf.foreach_thread` are currently supported.
-      Only `scf.foreach_thread` distributed to **at most 3 dimensions** are
+      Only **bufferized** `scf.forall` are currently supported.
+      Only `scf.forall` distributed to **at most 3 dimensions** are
       currently supported.
 
-      Barriers are inserted after each scf.foreach_thread op for now.
+      Barriers are inserted after each scf.forall op for now.
 
       The operation alters the block size of the given gpu_launch using
       blockDim argument.
@@ -54,15 +54,15 @@ def MapNestedForeachToThreads :
 
       This operation ignores non-gpu_launch ops and drops them in the return.
 
-      If any scf.foreach_thread with tensors is found, the transform definitely
+      If any scf.forall with tensors is found, the transform definitely
       fails.
 
-      If all the scf.foreach_thread operations with gpu.thread mapping contained
+      If all the scf.forall operations with gpu.thread mapping contained
       within the LaunchOp referred to by the `target` PDLOperation lower to GPU
       properly, the transform succeeds. Otherwise the transform definitely
       fails.
 
-      scf.foreach_thread operations with mappings other than gpu.thread are
+      scf.forall operations with mappings other than gpu.thread are
       ignored.
 
       The returned handle points to the same LaunchOp operand, consuming it and
@@ -74,10 +74,10 @@ def MapNestedForeachToThreads :
       ```
       gpu.launch blocks(%bx, %by, %bz) in (%x = %0, %y = %1, %z = %2)
                  threads(%tx, %ty, %tz) in (%tx = %3, %ty = %4, %tz = %5) {
-        scf.foreach_thread (%i, %j) in (7, 9) {
+        scf.forall (%i, %j) in (7, 9) {
           ... // body 1
         } {mapping = [#gpu.thread<x>, #gpu.thread<y>, #gpu.thread<z>]}
-        scf.foreach_thread (%i) in (12) {
+        scf.forall (%i) in (12) {
           ... // body 2
         } {mapping = [#gpu.thread<x>]}
         gpu.terminator
@@ -125,21 +125,21 @@ def MapForeachToBlocks :
      TransformOpInterface,
      TransformEachOpTrait]> {
   let description = [{
-    Target the gpu_launch op and rewrite the top level `scf.foreach_thread`
+    Target the gpu_launch op and rewrite the top level `scf.forall`
     to distributed gpu.block_id attribute. If `generate_gpu_launch` attribute
     is set, then first generates `gpu_launch` and moves the top level
-    `scf.foreach_thread` inside.
+    `scf.forall` inside.
 
-    The operation searches top level `scf.foreach_thread` ops under
+    The operation searches top level `scf.forall` ops under
     `gpu_launch` and maps each such op to GPU blocks. Mapping is
-    one-to-one and the induction variables of `scf.foreach_thread` are
+    one-to-one and the induction variables of `scf.forall` are
     rewritten to gpu.block_id according to the `thread_dim_mapping` attribute.
 
-    Dynamic, `scf.foreach_thread` trip counts are currently not supported.
+    Dynamic, `scf.forall` trip counts are currently not supported.
     Dynamic block dim sizes are currently not supported.
 
-    Only **bufferized** scf.foreach_thread are currently supported.
-    Only scf.foreach_thread distributed to **at most 3 dimensions** are
+    Only **bufferized** scf.forall are currently supported.
+    Only scf.forall distributed to **at most 3 dimensions** are
     currently supported.
 
     The operation alters the block size of the given gpu_launch using
@@ -149,10 +149,10 @@ def MapForeachToBlocks :
 
     This operation ignores non-gpu_launch ops and drops them in the return.
 
-    If any scf.foreach_thread with tensors is found, the transform definitely
+    If any scf.forall with tensors is found, the transform definitely
     fails.
 
-    If all the scf.foreach_thread operations contained within the LaunchOp
+    If all the scf.forall operations contained within the LaunchOp
     referred to by the `target` PDLOperation lower to GPU properly, the
     transform succeeds. Otherwise the transform definitely fails.
 

diff  --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
index f3e0f5618bf74..5473821e29271 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
@@ -78,8 +78,8 @@ FailureOr<GemmDimsForPacking> inferGemmDims(linalg::LinalgOp linalgOp);
 /// Return true if `linalgOp` contains an embedded gemm subcomputation.
 bool containsMostMinorGemm(linalg::LinalgOp linalgOp);
 
-/// Implementation of tiling operations using `scf.foreach_thread`.
-DiagnosedSilenceableFailure tileToForeachThreadOpImpl(
+/// Implementation of tiling operations using `scf.forall`.
+DiagnosedSilenceableFailure tileToForallOpImpl(
     RewriterBase &rewriter, transform::TransformState &state,
     TransformOpInterface transformOp, ArrayRef<Operation *> targets,
     ArrayRef<OpFoldResult> mixedNumThreads,

diff  --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index ff8f3e33703c2..6dc616bf9d951 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1197,7 +1197,7 @@ def TileReductionUsingScfOp : Op<Transform_Dialect, "structured.tile_reduction_u
     ```
   }];
 
-  // TODO: support mixed static-dynamic (see TileToForeachThreadOp).
+  // TODO: support mixed static-dynamic (see TileToForallOp).
   let arguments = (ins PDL_Operation:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes);
   let results = (outs PDL_Operation:$for_op,
@@ -1225,31 +1225,31 @@ def TileReductionUsingScfOp : Op<Transform_Dialect, "structured.tile_reduction_u
 }
 
 //===----------------------------------------------------------------------===//
-// TileReductionUsingForeachThreadOp
+// TileReductionUsingForallOp
 //===----------------------------------------------------------------------===//
 
-def TileReductionUsingForeachThreadOp :
-  Op<Transform_Dialect, "structured.tile_reduction_using_foreach_thread",
+def TileReductionUsingForallOp :
+  Op<Transform_Dialect, "structured.tile_reduction_using_forall",
        [FunctionalStyleTransformOpTrait, MemoryEffectsOpInterface,
         TransformEachOpTrait, TransformOpInterface]> {
   let description = [{
-    Tile a PartialReductionOpInterface op to a tiled `scf.foreach_thread` doing
+    Tile a PartialReductionOpInterface op to a tiled `scf.forall` doing
     partial reduction.
 
     This transformation tiles the `target` along the reduction dimensions. It
     creates a tensor initialized with the identity value. Then it creates a
-    `scf.foreach_thread` loops with the number threads given by `num_threads`.
+    `scf.forall` loop with the number of threads given by `num_threads`.
     The op is tiled with a size equal to `floordiv(size, num_threads)`.
     All the partial reduction values are inserted in parallel to create a new
     tensor. After the loop a merge operation is created to do a final reduction
     with the partial reductions tensor.
     If an extra `tile_sizes` parameter is passed the tiles are cyclically
-    distributed on the threads of the `scf.foreach_threads` loop.
+    distributed on the threads of the `scf.forall` loop.
 
     #### Return modes
 
     These 4 returned handles point to:
-      - the parent foreach_thread op,
+      - the parent forall op,
       - the fill op used to initialize the neutral element,
       - the parallel tiled op and
       - the result-combining op.
@@ -1274,7 +1274,7 @@ def TileReductionUsingForeachThreadOp :
     ```
       %0 = tensor.empty(%dim_1) : tensor<?x5xf32>
       %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<?x5xf32>) -> tensor<?x5xf32>
-      %2 = scf.foreach_thread (%arg2) in (%c5) shared_outs(%arg3 = %1) -> (tensor<?x5xf32>) {
+      %2 = scf.forall (%arg2) in (%c5) shared_outs(%arg3 = %1) -> (tensor<?x5xf32>) {
         %4 = affine.min #map(%arg2)[%dim_0]
         %5 = affine.max #map1(%4)
         %extracted_slice = tensor.extract_slice %arg3[0, %arg2] [%dim, 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
@@ -1286,7 +1286,7 @@ def TileReductionUsingForeachThreadOp :
           %9 = arith.addf %in, %out : f32
           linalg.yield %9 : f32
         } -> tensor<?xf32>
-        scf.foreach_thread.perform_concurrently {
+        scf.forall.in_parallel {
           tensor.parallel_insert_slice %7 into %arg3[0, %arg2] [%dim, 1] [1, 1] : tensor<?xf32> into tensor<?x5xf32>
         }
       } {mapping = []}
@@ -1298,12 +1298,12 @@ def TileReductionUsingForeachThreadOp :
     ```
   }];
 
-  // TODO: support mixed static-dynamic (see TileToForeachThreadOp).
+  // TODO: support mixed static-dynamic (see TileToForallOp).
   let arguments = (ins PDL_Operation:$target,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$num_threads,
                    DefaultValuedAttr<DenseI64ArrayAttr, "{}">:$tile_sizes,
                    OptionalAttr<DeviceMappingArrayAttr>:$mapping);
-  let results = (outs PDL_Operation:$foreach_thread_op,
+  let results = (outs PDL_Operation:$forall_op,
                       PDL_Operation:$fill_op,
                       PDL_Operation:$split_linalg_op,
                       PDL_Operation:$combining_linalg_op);
@@ -1412,16 +1412,16 @@ def TileOp : Op<Transform_Dialect, "structured.tile",
 }
 
 //===----------------------------------------------------------------------===//
-// TileToForeachThreadOp
+// TileToForallOp
 //===----------------------------------------------------------------------===//
 
-def TileToForeachThreadOp :
-    Op<Transform_Dialect, "structured.tile_to_foreach_thread_op",
+def TileToForallOp :
+    Op<Transform_Dialect, "structured.tile_to_forall_op",
       [AttrSizedOperandSegments,
        DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
        TransformOpInterface]> {
   let description = [{
-    Tile a TilingInterface op to a tiled `scf.foreach_thread`.
+    Tile a TilingInterface op to a tiled `scf.forall`.
 
     Tiling is applied by either specifying `num_threads` or `tile_size`. If
     `num_threads` is specified, then the tile size for each dimension `i` is
@@ -1438,7 +1438,7 @@ def TileToForeachThreadOp :
     e.g. in the Linalg case).
 
     If non-empty, the `mapping` is added as an attribute to the
-    resulting `scf.foreach_thread`.
+    resulting `scf.forall`.
 
     Note: `tile_sizes` and `num_threads` are variadic. Each tile size/number of
     threads can be an index attribute or a transform handle that is mapped to
@@ -1457,14 +1457,14 @@ def TileToForeachThreadOp :
     tiled operations, which can all be empty.
 
     These two returned handles point to:
-      - the new scf.foreach_thread op,
+      - the new scf.forall op,
       - the tiled op that implements TilingInterface.
 
     #### Example using `num_threads`
 
     ```
     %0 = pdl_match @match_matmul in %arg1
-    %3:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 20]
+    %3:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20]
     ```
 
     #### Example using `tile_sizes`
@@ -1472,7 +1472,7 @@ def TileToForeachThreadOp :
     ```
     %0 = pdl_match @match_matmul in %arg1
     %sz = pdl_match @match_size_op in %arg1
-    %3:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [0, %sz, 20]
+    %3:2 = transform.structured.tile_to_forall_op %0 tile_sizes [0, %sz, 20]
     ```
   }];
 
@@ -1484,7 +1484,7 @@ def TileToForeachThreadOp :
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_num_threads,
                    DefaultValuedOptionalAttr<DenseI64ArrayAttr, "{}">:$static_tile_sizes,
                    OptionalAttr<DeviceMappingArrayAttr>:$mapping);
-  let results = (outs PDL_Operation:$foreach_thread_op,
+  let results = (outs PDL_Operation:$forall_op,
                       PDL_Operation:$tiled_op);
 
   let builders = [

diff  --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index aedef03b88fc7..c782cab1e9f94 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -507,40 +507,40 @@ FailureOr<StaticMultiSizeSpecification>
 computeStaticMultiTileSizes(LinalgOp op, unsigned dimension, int64_t targetSize,
                             int64_t divisor);
 
-/// Rewrite a TilingInterface `op` to a tiled `scf.foreach_thread`, applying
+/// Rewrite a TilingInterface `op` to a tiled `scf.forall`, applying
 /// tiling by `numThreads`.
 /// If non-empty, the `mapping` is added as an attribute to the
-/// resulting `scf.foreach_thread`.
+/// resulting `scf.forall`.
 /// Zero tile sizes indicate that the dimension is not tiled, and can be
 /// thought of as tiling by the full size of data. It is the user's
 /// responsibility to ensure that `numThreads` is a valid tiling specification
 /// (i.e. that only tiles parallel dimensions, e.g. in the Linalg case).
-struct ForeachThreadTilingResult {
+struct ForallTilingResult {
   Operation *tileOp;
   Operation *tiledOp;
 };
-FailureOr<ForeachThreadTilingResult>
-tileToForeachThreadOp(RewriterBase &builder, TilingInterface op,
-                      ArrayRef<OpFoldResult> numThreads,
-                      std::optional<ArrayAttr> mapping);
+FailureOr<ForallTilingResult> tileToForallOp(RewriterBase &builder,
+                                             TilingInterface op,
+                                             ArrayRef<OpFoldResult> numThreads,
+                                             std::optional<ArrayAttr> mapping);
 
-/// Same as `tileToForeachThreadOp`, but calculate the number of threads
+/// Same as `tileToForallOp`, but calculate the number of threads
 /// required using the given tileSizes.
-FailureOr<ForeachThreadTilingResult>
-tileToForeachThreadOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
-                                    ArrayRef<OpFoldResult> tileSizes,
-                                    std::optional<ArrayAttr> mapping);
+FailureOr<ForallTilingResult>
+tileToForallOpUsingTileSizes(RewriterBase &builder, TilingInterface op,
+                             ArrayRef<OpFoldResult> tileSizes,
+                             std::optional<ArrayAttr> mapping);
 
 /// Transformation information returned after reduction tiling.
-struct ForeachThreadReductionTilingResult {
+struct ForallReductionTilingResult {
   /// The partial reduction tiled op generated.
   Operation *parallelTiledOp;
   /// The final reduction operation merging all the partial reductions.
   Operation *mergeOp;
   /// The op initializing the tensor used for partial reductions.
   Operation *initialOp;
-  /// The `scf.foreach_thread` operation that iterate over the tiles.
-  scf::ForeachThreadOp loops;
+  /// The `scf.forall` operation that iterates over the tiles.
+  scf::ForallOp loops;
 };
 
 /// Method to tile a reduction to parallel iterations computing partial
@@ -556,7 +556,7 @@ struct ForeachThreadReductionTilingResult {
 ///
 /// ```mlir
 /// %0 = linalg.fill ... : tensor<7x4xf32>
-/// %1 = scf.foreach_thread (%iv) in (%c4) shared_outs(%arg0 = %0)
+/// %1 = scf.forall (%iv) in (%c4) shared_outs(%arg0 = %0)
 ///   -> (tensor<7x4xf32>) {
 ///   %2 = tensor.extract_slice %arg3 : tensor<7x4xf32> to tensor<7xf32>
 ///   %3 = tensor.extract_slice %in : tensor<7x9xf32> -> tensor<7x?xf32>
@@ -567,10 +567,11 @@ struct ForeachThreadReductionTilingResult {
 /// %6 = linalg.generic %1 ["parallel", "reduction"]
 ///   : tensor<7x4xf32> -> tensor<7xf32>
 /// ```
-FailureOr<ForeachThreadReductionTilingResult> tileReductionUsingForeachThread(
-    RewriterBase &b, PartialReductionOpInterface op,
-    ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes = {},
-    std::optional<ArrayAttr> mapping = std::nullopt);
+FailureOr<ForallReductionTilingResult>
+tileReductionUsingForall(RewriterBase &b, PartialReductionOpInterface op,
+                         ArrayRef<OpFoldResult> numThreads,
+                         ArrayRef<OpFoldResult> tileSizes = {},
+                         std::optional<ArrayAttr> mapping = std::nullopt);
 
 /// All indices returned by IndexOp should be invariant with respect to
 /// tiling. Therefore, if an operation is tiled, we have to transform the

diff  --git a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td
index e7504fd7523ae..8d07f791d3a8f 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/DeviceMappingInterface.td
@@ -30,7 +30,7 @@ def DeviceMappingAttrInterface : AttrInterface<"DeviceMappingAttrInterface"> {
     can be used by the device-specific code generators and the desired regions
     can be connected to the given processing unit.
 
-    Currently, `scf.foreach_thread` uses this interface to express the mapping
+    Currently, `scf.forall` uses this interface to express the mapping
     of the loops it contains to the GPU's parallelism units such as threads and
     thread blocks.
   }];

diff  --git a/mlir/include/mlir/Dialect/SCF/IR/SCF.h b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
index 5453f3862e744..9ae71bc73d02f 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCF.h
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCF.h
@@ -52,9 +52,9 @@ ForOp getForInductionVarOwner(Value val);
 /// value is not an induction variable, then return nullptr.
 ParallelOp getParallelForInductionVarOwner(Value val);
 
-/// Returns the ForeachThreadOp parent of an thread index variable.
+/// Returns the ForallOp parent of a thread index variable.
 /// If the provided value is not a thread index variable, then return nullptr.
-ForeachThreadOp getForeachThreadOpThreadIndexOwner(Value val);
+ForallOp getForallOpThreadIndexOwner(Value val);
 
 /// Return true if ops a and b (or their ancestors) are in mutually exclusive
 /// regions/blocks of an IfOp.

diff  --git a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
index 6d627635bfe8e..0da8558681c45 100644
--- a/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
+++ b/mlir/include/mlir/Dialect/SCF/IR/SCFOps.td
@@ -348,18 +348,18 @@ def ForOp : SCF_Op<"for",
 }
 
 //===----------------------------------------------------------------------===//
-// ForeachThreadOp
+// ForallOp
 //===----------------------------------------------------------------------===//
 
-def ForeachThreadOp : SCF_Op<"foreach_thread", [
+def ForallOp : SCF_Op<"forall", [
        AttrSizedOperandSegments,
        AutomaticAllocationScope,
        RecursiveMemoryEffects,
-       SingleBlockImplicitTerminator<"scf::PerformConcurrentlyOp">,
+       SingleBlockImplicitTerminator<"scf::InParallelOp">,
      ]> {
   let summary = "evaluate a block multiple times in parallel";
   let description = [{
-    `scf.foreach_thread` is a target-independent multi-dimensional parallel
+    `scf.forall` is a target-independent multi-dimensional parallel
     region application operation. It has exactly one block that represents the
     parallel body and it takes index operands that specify lower bounds, upper
     bounds and steps.
@@ -389,22 +389,22 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     the op is lowered to, or to ignore it when the specification is ill-formed
     or unsupported for a particular target.
 
-    The only allowed terminator is `scf.foreach_thread.perform_concurrently`.
-    `scf.foreach_thread` returns one value per `shared_out` operand. The
-    actions of the `perform_concurrently` terminators specify how to combine the
+    The only allowed terminator is `scf.forall.in_parallel`.
+    `scf.forall` returns one value per `shared_out` operand. The
+    actions of the `in_parallel` terminators specify how to combine the
     partial results of all parallel invocations into a full value, in some
     unspecified order. The "destination" of each such op must be a `shared_out`
-    block argument of the `scf.foreach_thread` op.
+    block argument of the `scf.forall` op.
 
     The actions involved in constructing the return values are further described
     by `tensor.parallel_insert_slice`.
 
-    `scf.foreach_thread` acts as an implicit synchronization point.
+    `scf.forall` acts as an implicit synchronization point.
 
     When the parallel function body has side effects, their order is unspecified
     across threads.
 
-    `scf.foreach_thread` can be printed in two different ways depending on
+    `scf.forall` can be printed in two different ways depending on
     whether the loop is normalized or not. The loop is 'normalized' when all
     lower bounds are equal to zero and steps are equal to one. In that case,
     `lowerBound` and `step` operands will be omitted during printing.
@@ -415,7 +415,7 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     //
     // Sequential context.
     //
-    %matmul_and_pointwise:2 = scf.foreach_thread (%thread_id_1, %thread_id_2) in
+    %matmul_and_pointwise:2 = scf.forall (%thread_id_1, %thread_id_2) in
         (%num_threads_1, %numthread_id_2) shared_outs(%o1 = %C, %o2 = %pointwise)
       -> (tensor<?x?xT>, tensor<?xT>) {
       //
@@ -434,11 +434,11 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
         tensor<?xT> to tensor<?xT>
       %sE = add ins(%spointwise) outs(%sD)
 
-      scf.foreach_thread.perform_concurrently {
-        scf.foreach_thread.parallel_insert_slice %sD into %o1[h((%thread_id_1, %thread_id_2))]:
+      scf.forall.in_parallel {
+        scf.forall.parallel_insert_slice %sD into %o1[h((%thread_id_1, %thread_id_2))]:
           tensor<?x?xT> into tensor<?x?xT>
 
-        scf.foreach_thread.parallel_insert_slice %spointwise into %o2[i((%thread_id_1, %thread_id_2))]:
+        scf.forall.parallel_insert_slice %spointwise into %o2[i((%thread_id_1, %thread_id_2))]:
           tensor<?xT> into tensor<?xT>
       }
     }
@@ -453,7 +453,7 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     //
     // Sequential context.
     //
-    %pointwise = scf.foreach_thread (%i, %j) = (0, 0) to (%dim1, %dim2)
+    %pointwise = scf.forall (%i, %j) = (0, 0) to (%dim1, %dim2)
       step (%tileSize1, %tileSize2) shared_outs(%o1 = %out)
       -> (tensor<?x?xT>, tensor<?xT>) {
       //
@@ -468,8 +468,8 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
 
       %add = map {"arith.addf"} ins(%sA, %sB) outs(%sC)
 
-      scf.foreach_thread.perform_concurrently {
-        scf.foreach_thread.parallel_insert_slice %add into
+      scf.forall.in_parallel {
+        scf.forall.parallel_insert_slice %add into
           %o[%i, %j][%tileSize1, %tileSize2][1, 1]
           : tensor<?x?xT> into tensor<?x?xT>
       }
@@ -486,14 +486,14 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     // Sequential context. Here `mapping` is expressed as GPU thread mapping
     // attributes
     //
-    %matmul_and_pointwise:2 = scf.foreach_thread (%thread_id_1, %thread_id_2) in
+    %matmul_and_pointwise:2 = scf.forall (%thread_id_1, %thread_id_2) in
         (%num_threads_1, %numthread_id_2) shared_outs(...)
       -> (tensor<?x?xT>, tensor<?xT>) {
       //
       // Parallel context, each thread with id = **(%thread_id_2, %thread_id_1)**
       // runs its version of the code.
       //
-       scf.foreach_thread.perform_concurrently {
+       scf.forall.in_parallel {
          ...
       }
     } { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
@@ -507,9 +507,9 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     ```mlir
     %t0 = ...
     %t1 = ...
-    %r = scf.foreach_thread ... shared_outs(%o = t0) -> tensor<?xf32> {
+    %r = scf.forall ... shared_outs(%o = t0) -> tensor<?xf32> {
       // %t0 and %t1 are privatized. %t0 is definitely copied for each thread
-      // because the scf.foreach_thread op's %t0 use bufferizes to a memory
+      // because the scf.forall op's %t0 use bufferizes to a memory
       // write. In the absence of other conflicts, %t1 is copied only if there
       // are uses of %t1 in the body that bufferize to a memory read and to a
       // memory write.
@@ -661,28 +661,28 @@ def ForeachThreadOp : SCF_Op<"foreach_thread", [
     static void ensureTerminator(Region & region, OpBuilder & builder,
                                  Location loc);
 
-    PerformConcurrentlyOp getTerminator();
+    InParallelOp getTerminator();
   }];
 }
 
 //===----------------------------------------------------------------------===//
-// PerformConcurrentlyOp
+// InParallelOp
 //===----------------------------------------------------------------------===//
 
-def PerformConcurrentlyOp : SCF_Op<"foreach_thread.perform_concurrently", [
+def InParallelOp : SCF_Op<"forall.in_parallel", [
        Pure,
        Terminator,
        DeclareOpInterfaceMethods<ParallelCombiningOpInterface>,
-       HasParent<"ForeachThreadOp">,
+       HasParent<"ForallOp">,
       ] # GraphRegionNoTerminator.traits> {
-  let summary = "terminates a `foreach_thread` block";
+  let summary = "terminates a `forall` block";
   let description = [{
-    `scf.foreach_thread.perform_concurrently` is a designated terminator for
-    the `scf.foreach_thread` operation.
+    `scf.forall.in_parallel` is a designated terminator for
+    the `scf.forall` operation.
 
     It has a single region with a single block that contains a flat list of ops.
     Each such op participates in the aggregate formation of a single result of
-    the enclosing `scf.foreach_thread`.
+    the enclosing `scf.forall`.
     The result number corresponds to the position of the op in the terminator.
   }];
 
@@ -697,8 +697,8 @@ def PerformConcurrentlyOp : SCF_Op<"foreach_thread.perform_concurrently", [
     OpBuilder<(ins)>,
   ];
 
-  // TODO: Add a `PerformConcurrentlyOpInterface` interface for ops that can
-  // appear inside perform_concurrently.
+  // TODO: Add an `InParallelOpInterface` interface for ops that can
+  // appear inside in_parallel.
   let extraClassDeclaration = [{
     ::llvm::SmallVector<::mlir::BlockArgument> getDests();
     ::llvm::iterator_range<::mlir::Block::iterator> getYieldingOps();

diff  --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 77d183e7d6ec3..9652d7de5f7cd 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1325,7 +1325,7 @@ def Tensor_PadOp : Tensor_Op<"pad", [
 // ParallelInsertSliceOp
 //===----------------------------------------------------------------------===//
 
-// TODO: Implement PerformConcurrentlyOpInterface.
+// TODO: Implement InParallelOpInterface.
 def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
        AttrSizedOperandSegments,
        OffsetSizeAndStrideOpInterface,
@@ -1441,7 +1441,7 @@ def Tensor_ParallelInsertSliceOp : Tensor_Op<"parallel_insert_slice", [
     /// `strides` operands.
     static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
 
-    /// Return the OpResult of the enclosing ForeachThreadOp that is
+    /// Return the OpResult of the enclosing ForallOp that is
     /// corresponding to this ParallelInsertSliceOp.
     OpResult getTiedOpResult();
   }];

diff  --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
index e971b764c6f05..ffe5c2a67e05c 100644
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -169,39 +169,38 @@ alterGpuLaunch(TrivialPatternRewriter &rewriter, LaunchOp gpuLaunch,
 //===----------------------------------------------------------------------===//
 
 DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
-    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
-    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
-                      SmallVectorImpl<Value> &)>
+    RewriterBase &rewriter, scf::ForallOp forallOp,
+    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
         blockIdGenerator,
     SmallVectorImpl<int64_t> &gridDims, TransformOpInterface transformOp,
     const ArrayRef<DeviceMappingAttrInterface> &mappingAttributes) {
   // Step 0. Target-specific verifications. There is no good place to anchor
-  // those right now: the ForeachThreadOp is target-independent and the
-  // transform op does not apply to individual ForeachThreadOp.
-  Location loc = foreachThreadOp->getLoc();
+  // those right now: the ForallOp is target-independent and the
+  // transform op does not apply to individual ForallOp.
+  Location loc = forallOp->getLoc();
 
-  if (!foreachThreadOp.isNormalized())
+  if (!forallOp.isNormalized())
     return transformOp.emitSilenceableError()
            << "unsupported non-normalized loops";
-  if (foreachThreadOp.getNumResults() > 0)
+  if (forallOp.getNumResults() > 0)
     return transformOp.emitSilenceableError()
-           << "only bufferized scf.foreach_thread lowers to "
+           << "only bufferized scf.forall lowers to "
               "gpu.block_id";
-  if (foreachThreadOp.getRank() > 3)
+  if (forallOp.getRank() > 3)
     return transformOp.emitSilenceableError()
-           << "scf.foreach_thread with rank > 3 does not lower to "
+           << "scf.forall with rank > 3 does not lower to "
               "gpu.block_id";
-  if (llvm::any_of(foreachThreadOp.getMixedUpperBound(), [](OpFoldResult ofr) {
+  if (llvm::any_of(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {
         return !getConstantIntValue(ofr).has_value();
       })) {
     return transformOp.emitSilenceableError()
            << "unsupported dynamic griddim size";
   }
   SmallVector<Attribute> blockMapping =
-      llvm::to_vector(foreachThreadOp.getMapping()->getValue());
+      llvm::to_vector(forallOp.getMapping()->getValue());
 
   // Step 1. Complete the blockMapping to a full mapping (with 1s) if necessary.
-  SmallVector<Value> numBlocks = foreachThreadOp.getUpperBound(rewriter);
+  SmallVector<Value> numBlocks = forallOp.getUpperBound(rewriter);
   // Ensure we have 3 block sizes, one for each id.
   Value one;
   for (auto attr : mappingAttributes) {
@@ -218,68 +217,68 @@ DiagnosedSilenceableFailure mlir::transform::gpu::mapForeachToBlocksImpl(
                         DeviceMappingAttrInterface b) -> bool {
     return a.getMappingId() < b.getMappingId();
   };
-  SmallVector<Value> gridDimValues = scf::ForeachThreadOp::getValuesSortedByKey(
-      blockMapping, numBlocks, comparator);
+  SmallVector<Value> gridDimValues =
+      scf::ForallOp::getValuesSortedByKey(blockMapping, numBlocks, comparator);
   for (Value v : gridDimValues)
     gridDims.push_back(v.getDefiningOp<arith::ConstantIndexOp>().value());
 
   // Step 3. Generate the blockIds using the provided generator and map the
   // induction variables to the newly created ops.
   SmallVector<Value> blockOps;
-  blockIdGenerator(rewriter, foreachThreadOp, blockOps);
+  blockIdGenerator(rewriter, forallOp, blockOps);
   IRMapping bvm;
   for (auto [blockIdx, blockDim] :
-       llvm::zip(foreachThreadOp.getInductionVars(), blockMapping)) {
+       llvm::zip(forallOp.getInductionVars(), blockMapping)) {
     bvm.map(blockIdx,
             blockOps[static_cast<int64_t>(
                 blockDim.cast<DeviceMappingAttrInterface>().getMappingId())]);
   }
 
-  // Step 4. Move the body of foreachThreadOp.
+  // Step 4. Move the body of forallOp.
   // Erase the terminator first, it will not be used since we are on buffers.
-  rewriter.eraseOp(foreachThreadOp.getTerminator());
-  Block *targetBlock = foreachThreadOp->getBlock();
-  Block::iterator insertionPoint = Block::iterator(foreachThreadOp);
-  Block &sourceBlock = foreachThreadOp.getRegion().front();
+  rewriter.eraseOp(forallOp.getTerminator());
+  Block *targetBlock = forallOp->getBlock();
+  Block::iterator insertionPoint = Block::iterator(forallOp);
+  Block &sourceBlock = forallOp.getRegion().front();
   targetBlock->getOperations().splice(insertionPoint,
                                       sourceBlock.getOperations());
 
   // Step 5. RAUW thread indices to thread ops.
-  for (Value loopIndex : foreachThreadOp.getInductionVars()) {
+  for (Value loopIndex : forallOp.getInductionVars()) {
     Value blockIdx = bvm.lookup(loopIndex);
     rewriter.replaceAllUsesWith(loopIndex, blockIdx);
   }
 
   // Step 6. Erase old op.
-  rewriter.eraseOp(foreachThreadOp);
+  rewriter.eraseOp(forallOp);
 
   return DiagnosedSilenceableFailure::success();
 }
 
-DiagnosedSilenceableFailure mlir::transform::gpu::findTopLevelForeachThreadOp(
-    Operation *target, scf::ForeachThreadOp &topLevelForeachThreadOp,
-    TransformOpInterface transformOp) {
-  auto walkResult = target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
-    if (foreachThreadOp->getParentOfType<scf::ForeachThreadOp>())
+DiagnosedSilenceableFailure
+mlir::transform::gpu::findTopLevelForallOp(Operation *target,
+                                           scf::ForallOp &topLevelForallOp,
+                                           TransformOpInterface transformOp) {
+  auto walkResult = target->walk([&](scf::ForallOp forallOp) {
+    if (forallOp->getParentOfType<scf::ForallOp>())
       return WalkResult::advance();
-    if (topLevelForeachThreadOp)
+    if (topLevelForallOp)
       // TODO: Handle multiple foreach if there are no dependences between them
       return WalkResult::interrupt();
-    topLevelForeachThreadOp = foreachThreadOp;
+    topLevelForallOp = forallOp;
     return WalkResult::advance();
   });
 
   if (walkResult.wasInterrupted())
     return transformOp.emitSilenceableError()
-           << "could not find a unique topLevel scf.foreach_thread";
+           << "could not find a unique topLevel scf.forall";
   return DiagnosedSilenceableFailure::success();
 }
 
 /// This is a helper that is only used in
-/// rewriteTopLevelForeachThreadToGpuBlocks. It generates GPU dialects
+/// rewriteTopLevelForallToGpuBlocks. It generates GPU dialects
 /// block_id.
-static void generateGpuBlockIds(RewriterBase &rewriter,
-                                scf::ForeachThreadOp foreachOp,
+static void generateGpuBlockIds(RewriterBase &rewriter, scf::ForallOp foreachOp,
                                 SmallVectorImpl<Value> &blockOps) {
   Location loc = foreachOp->getLoc();
   OpBuilder::InsertionGuard guard(rewriter);
@@ -308,19 +307,18 @@ transform::MapForeachToBlocks::applyToOne(Operation *target,
     return diag;
   }
 
-  scf::ForeachThreadOp topLevelForeachThreadOp;
-  DiagnosedSilenceableFailure diag =
-      mlir::transform::gpu::findTopLevelForeachThreadOp(
-          target, topLevelForeachThreadOp, transformOp);
+  scf::ForallOp topLevelForallOp;
+  DiagnosedSilenceableFailure diag = mlir::transform::gpu::findTopLevelForallOp(
+      target, topLevelForallOp, transformOp);
   if (!diag.succeeded()) {
     diag.attachNote(target->getLoc()) << "when applied to this payload op";
     return diag;
   }
 
   OpBuilder::InsertionGuard guard(rewriter);
-  rewriter.setInsertionPoint(topLevelForeachThreadOp);
+  rewriter.setInsertionPoint(topLevelForallOp);
 
-  // Generate gpu launch here and move the foreach_thread inside
+  // Generate gpu launch here and move the forall inside
   if (getGenerateGpuLaunch()) {
     DiagnosedSilenceableFailure diag =
         createGpuLaunch(rewriter, target->getLoc(), transformOp, gpuLaunch);
@@ -328,9 +326,9 @@ transform::MapForeachToBlocks::applyToOne(Operation *target,
       return diag;
     }
     rewriter.setInsertionPointToStart(&gpuLaunch.getBody().front());
-    Operation *newForeachThreadOp = rewriter.clone(*topLevelForeachThreadOp);
-    rewriter.eraseOp(topLevelForeachThreadOp);
-    topLevelForeachThreadOp = cast<scf::ForeachThreadOp>(newForeachThreadOp);
+    Operation *newForallOp = rewriter.clone(*topLevelForallOp);
+    rewriter.eraseOp(topLevelForallOp);
+    topLevelForallOp = cast<scf::ForallOp>(newForallOp);
   }
 
   SmallVector<int64_t> gridDim = extractFromI64ArrayAttr(getGridDim());
@@ -340,11 +338,11 @@ transform::MapForeachToBlocks::applyToOne(Operation *target,
       GPUBlockMappingAttr::get(getContext(), Blocks::DimZ)};
 
   diag = checkAttributeType(blockMappingAttributes,
-                            topLevelForeachThreadOp.getMapping(), transformOp);
+                            topLevelForallOp.getMapping(), transformOp);
   if (diag.succeeded())
     diag = mlir::transform::gpu::mapForeachToBlocksImpl(
-        rewriter, topLevelForeachThreadOp, generateGpuBlockIds, gridDim,
-        transformOp, blockMappingAttributes);
+        rewriter, topLevelForallOp, generateGpuBlockIds, gridDim, transformOp,
+        blockMappingAttributes);
   if (diag.succeeded()) {
     diag = alterGpuLaunch(rewriter, gpuLaunch,
                           cast<TransformOpInterface>(getOperation()),
@@ -359,51 +357,50 @@ transform::MapForeachToBlocks::applyToOne(Operation *target,
 // MapNestedForeachToThreads
 //===----------------------------------------------------------------------===//
 
-/// Searches `scf.foreach_thread` ops nested under `target` and maps each such
+/// Searches `scf.forall` ops nested under `target` and maps each such
 /// op to GPU threads. Mapping is one-to-one and the induction variables of
-/// `scf.foreach_thread` are rewritten to gpu.thread_id according to the
-/// thread_dim_mapping attribute. Sibling `scf.foreach_thread` are supported in
+/// `scf.forall` are rewritten to gpu.thread_id according to the
+/// thread_dim_mapping attribute. Sibling `scf.forall` are supported in
 /// which case, the union of the number of threads is computed and may result
-/// in predication. Dynamic, `scf.foreach_thread` trip counts are currently
+/// in predication. Dynamic, `scf.forall` trip counts are currently
 /// not supported. Dynamic block dim sizes are currently not supported.
-static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
-    RewriterBase &rewriter, scf::ForeachThreadOp foreachThreadOp,
+static DiagnosedSilenceableFailure rewriteOneForallToGpuThreads(
+    RewriterBase &rewriter, scf::ForallOp forallOp,
     const SmallVectorImpl<int64_t> &globalBlockDims,
     const SmallVectorImpl<Value> &threadOps, bool syncAfterDistribute,
     std::optional<TransformOpInterface> transformOp,
     const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes) {
   // Step 0. Target-specific verifications. There is no good place to anchor
-  // those right now: the ForeachThreadOp is target-independent and the
-  // transform op does not apply to individual ForeachThreadOp.
+  // those right now: the ForallOp is target-independent and the
+  // transform op does not apply to individual ForallOp.
   auto failureHelper =
       [&](const Twine &message) -> DiagnosedSilenceableFailure {
     if (transformOp.has_value()) {
       return transformOp->emitSilenceableError() << message;
     }
-    return emitDefiniteFailure(foreachThreadOp, message);
+    return emitDefiniteFailure(forallOp, message);
   };
-  Location loc = foreachThreadOp->getLoc();
-  if (!foreachThreadOp.isNormalized())
+  Location loc = forallOp->getLoc();
+  if (!forallOp.isNormalized())
     return failureHelper("unsupported non-normalized loops");
-  if (foreachThreadOp.getNumResults() > 0)
-    return failureHelper(
-        "only bufferized scf.foreach_thread lowers to gpu.thread_id");
-  if (foreachThreadOp.getRank() > 3)
+  if (forallOp.getNumResults() > 0)
+    return failureHelper("only bufferized scf.forall lowers to gpu.thread_id");
+  if (forallOp.getRank() > 3)
     return failureHelper(
-        "scf.foreach_thread with rank > 3 does not lower to gpu.thread_id");
-  if (llvm::any_of(foreachThreadOp.getMixedUpperBound(), [](OpFoldResult ofr) {
+        "scf.forall with rank > 3 does not lower to gpu.thread_id");
+  if (llvm::any_of(forallOp.getMixedUpperBound(), [](OpFoldResult ofr) {
         return !getConstantIntValue(ofr).has_value();
       })) {
     return failureHelper("unsupported dynamic blockdim size");
   }
-  if (!foreachThreadOp.getMapping().has_value())
+  if (!forallOp.getMapping().has_value())
     return failureHelper("mapping must be present");
   SmallVector<Attribute> threadMapping =
-      llvm::to_vector(foreachThreadOp.getMapping()->getValue());
+      llvm::to_vector(forallOp.getMapping()->getValue());
 
   // Step 1. Complete the threadMapping to a full mapping (with 1s) if
   // necessary.
-  SmallVector<Value> numThreads = foreachThreadOp.getUpperBound(rewriter);
+  SmallVector<Value> numThreads = forallOp.getUpperBound(rewriter);
   // Ensure we have 3 block sizes, one for each id.
   Value one;
   for (auto attr : threadMappingAttributes) {
@@ -420,9 +417,8 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
                         DeviceMappingAttrInterface b) -> bool {
     return a.getMappingId() < b.getMappingId();
   };
-  SmallVector<Value> blockDimValues =
-      scf::ForeachThreadOp::getValuesSortedByKey(threadMapping, numThreads,
-                                                 comparator);
+  SmallVector<Value> blockDimValues = scf::ForallOp::getValuesSortedByKey(
+      threadMapping, numThreads, comparator);
   SmallVector<int64_t> blockDims =
       llvm::to_vector(llvm::map_range(blockDimValues, [](Value v) {
         return v.getDefiningOp<arith::ConstantIndexOp>().value();
@@ -440,7 +436,7 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
   }
   IRMapping bvm;
   for (auto [blockIdx, blockDim] :
-       llvm::zip(foreachThreadOp.getInductionVars(), threadMapping)) {
+       llvm::zip(forallOp.getInductionVars(), threadMapping)) {
     bvm.map(blockIdx,
             threadOpsUpdated[blockDim.cast<DeviceMappingAttrInterface>()
                                  .getMappingId()]);
@@ -453,7 +449,7 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
     if (blockDim > globalBlockDim) {
       return failureHelper(
           "The requested GPU threads are fewer than the number of loop trip "
-          "counts. Try to tile scf.foreach_thread before mapping or set "
+          "counts. Try to tile scf.forall before mapping or set "
           "small blockDim.");
     }
     if (blockDim == globalBlockDim)
@@ -466,9 +462,9 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
                   : tmpPredicate;
   }
 
-  // Step 5. Move the body of foreachThreadOp.
+  // Step 5. Move the body of forallOp.
   // Erase the terminator first, it will not be used.
-  rewriter.eraseOp(foreachThreadOp.getTerminator());
+  rewriter.eraseOp(forallOp.getTerminator());
   Block *targetBlock;
   Block::iterator insertionPoint;
   if (predicate) {
@@ -478,16 +474,16 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
     targetBlock = ifOp.thenBlock();
     insertionPoint = ifOp.thenBlock()->begin();
   } else {
-    // Step 5.b. Otherwise, move inline just before foreachThreadOp.
-    targetBlock = foreachThreadOp->getBlock();
-    insertionPoint = Block::iterator(foreachThreadOp);
+    // Step 5.b. Otherwise, move inline just before forallOp.
+    targetBlock = forallOp->getBlock();
+    insertionPoint = Block::iterator(forallOp);
   }
-  Block &sourceBlock = foreachThreadOp.getRegion().front();
+  Block &sourceBlock = forallOp.getRegion().front();
   targetBlock->getOperations().splice(insertionPoint,
                                       sourceBlock.getOperations());
 
   // Step 6. RAUW thread indices to thread ops.
-  for (Value loopIndex : foreachThreadOp.getInductionVars()) {
+  for (Value loopIndex : forallOp.getInductionVars()) {
     Value threadIdx = bvm.lookup(loopIndex);
     rewriter.replaceAllUsesWith(loopIndex, threadIdx);
   }
@@ -498,7 +494,7 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
     rewriter.create<BarrierOp>(loc);
 
   // Step 8. Erase old op.
-  rewriter.eraseOp(foreachThreadOp);
+  rewriter.eraseOp(forallOp);
 
   return DiagnosedSilenceableFailure::success();
 }
@@ -506,28 +502,27 @@ static DiagnosedSilenceableFailure rewriteOneForeachThreadToGpuThreads(
 DiagnosedSilenceableFailure mlir::transform::gpu::mapNestedForeachToThreadsImpl(
     RewriterBase &rewriter, Operation *target,
     const SmallVectorImpl<int64_t> &blockDim,
-    function_ref<void(RewriterBase &, scf::ForeachThreadOp,
-                      SmallVectorImpl<Value> &)>
+    function_ref<void(RewriterBase &, scf::ForallOp, SmallVectorImpl<Value> &)>
         threadIdGenerator,
     bool syncAfterDistribute, std::optional<TransformOpInterface> transformOp,
     const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes) {
   DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
-  target->walk([&](scf::ForeachThreadOp foreachThreadOp) {
+  target->walk([&](scf::ForallOp forallOp) {
     // Ignore cases with different attributes.
-    for (Attribute map : foreachThreadOp.getMapping()->getValue()) {
+    for (Attribute map : forallOp.getMapping()->getValue()) {
       if (!llvm::is_contained(threadMappingAttributes, map)) {
         return WalkResult::skip();
       }
     }
-    diag = checkAttributeType(threadMappingAttributes,
-                              foreachThreadOp.getMapping(), transformOp);
+    diag = checkAttributeType(threadMappingAttributes, forallOp.getMapping(),
+                              transformOp);
     if (diag.succeeded()) {
-      rewriter.setInsertionPoint(foreachThreadOp);
+      rewriter.setInsertionPoint(forallOp);
       SmallVector<Value> threadOps;
-      threadIdGenerator(rewriter, foreachThreadOp, threadOps);
-      diag = rewriteOneForeachThreadToGpuThreads(
-          rewriter, foreachThreadOp, blockDim, threadOps, syncAfterDistribute,
-          transformOp, threadMappingAttributes);
+      threadIdGenerator(rewriter, forallOp, threadOps);
+      diag = rewriteOneForallToGpuThreads(rewriter, forallOp, blockDim,
+                                          threadOps, syncAfterDistribute,
+                                          transformOp, threadMappingAttributes);
     }
     return diag.succeeded() ? WalkResult::advance() : WalkResult::interrupt();
   });
@@ -562,16 +557,15 @@ DiagnosedSilenceableFailure transform::MapNestedForeachToThreads::applyToOne(
       GPUThreadMappingAttr::get(ctx, Threads::DimX),
       GPUThreadMappingAttr::get(ctx, Threads::DimY),
       GPUThreadMappingAttr::get(ctx, Threads::DimZ)};
-  auto threadIdGenerator = [](RewriterBase &rewriter,
-                              scf::ForeachThreadOp foreachThreadOp,
+  auto threadIdGenerator = [](RewriterBase &rewriter, scf::ForallOp forallOp,
                               SmallVectorImpl<Value> &threadIds) {
     IndexType indexType = rewriter.getIndexType();
-    threadIds.assign({rewriter.create<ThreadIdOp>(foreachThreadOp->getLoc(),
-                                                  indexType, Dimension::x),
-                      rewriter.create<ThreadIdOp>(foreachThreadOp->getLoc(),
-                                                  indexType, Dimension::y),
-                      rewriter.create<ThreadIdOp>(foreachThreadOp->getLoc(),
-                                                  indexType, Dimension::z)});
+    threadIds.assign({rewriter.create<ThreadIdOp>(forallOp->getLoc(), indexType,
+                                                  Dimension::x),
+                      rewriter.create<ThreadIdOp>(forallOp->getLoc(), indexType,
+                                                  Dimension::y),
+                      rewriter.create<ThreadIdOp>(forallOp->getLoc(), indexType,
+                                                  Dimension::z)});
   };
   diag = mlir::transform::gpu::mapNestedForeachToThreadsImpl(
       rewriter, target, blockDim, threadIdGenerator, getSyncAfterDistribute(),

diff  --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 82c4c39032001..7fd290864b3bb 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -482,7 +482,7 @@ static Operation *tileAndFuseFirstExtractUse(RewriterBase &rewriter,
   return fusedOp;
 }
 
-/// First, find the first "scf::ForeachThreadOp" user of `producerOp` and ensure
+/// First, find the first "scf::ForallOp" user of `producerOp` and ensure
 /// it is exactly the `containingOp`, otherwise bail.
 /// Then, find the first "extract" user of the tied block argument and tile it
 /// right before its "extract" use. The tiled op is fused under the
@@ -500,15 +500,15 @@ static Operation *tileAndFuseFirstExtractUseThroughContainingOpBlockArgument(
     return nullptr;
   }
 
-  // Search the first use by a "scf::ForeachThreadOp" user.
-  scf::ForeachThreadOp foreachThreadOp;
+  // Search the first use by a "scf::ForallOp" user.
+  scf::ForallOp forallOp;
   auto itProducerUses =
       llvm::find_if(tileableProducer->getUses(), [&](OpOperand &use) {
-        foreachThreadOp = dyn_cast<scf::ForeachThreadOp>(use.getOwner());
-        return foreachThreadOp;
+        forallOp = dyn_cast<scf::ForallOp>(use.getOwner());
+        return forallOp;
       });
   // If it's not from the containing op, return.
-  if (!foreachThreadOp || foreachThreadOp != containingOp) {
+  if (!forallOp || forallOp != containingOp) {
     diag.attachNote(tileableProducer->getLoc())
         << "could not find a use by the containing op: " << *tileableProducer;
     return nullptr;
@@ -519,7 +519,7 @@ static Operation *tileAndFuseFirstExtractUseThroughContainingOpBlockArgument(
   // TODO: Generalize to more extract/insert/parallel_insert triples.
   //   Maybe evolve into an interface.
   OpOperand *pUse = &(*itProducerUses);
-  BlockArgument bbArg = foreachThreadOp.getTiedBlockArgument(pUse);
+  BlockArgument bbArg = forallOp.getTiedBlockArgument(pUse);
 
   // Search the producer slices accessed within the containing operation.
   // TODO: Generalize to more extract/insert/parallel_insert triples, maybe
@@ -2188,7 +2188,7 @@ void transform::TileReductionUsingScfOp::build(
   // This is future-proof re mixed static-dynamic and setting up the proper
   // operands segment sizes attributes for multiple variadic operands.
   // In the absence of this, horrible bugs ensue.
-  // TODO: support mixed static-dynamic (see TileToForeachThreadOp).
+  // TODO: support mixed static-dynamic (see TileToForallOp).
   MLIRContext *ctx = builder.getContext();
   auto opTy = pdl::OperationType::get(ctx);
   auto staticTileSizesAttr = builder.getDenseI64ArrayAttr(staticTileSizes);
@@ -2217,10 +2217,10 @@ DiagnosedSilenceableFailure transform::TileReductionUsingScfOp::applyToOne(
 }
 
 //===----------------------------------------------------------------------===//
-// TileReductionUsingForeachThreadOp
+// TileReductionUsingForallOp
 //===----------------------------------------------------------------------===//
 
-void transform::TileReductionUsingForeachThreadOp::build(
+void transform::TileReductionUsingForallOp::build(
     OpBuilder &builder, OperationState &result, Value target,
     ArrayRef<int64_t> staticNumThreads, ArrayRef<int64_t> staticTileSizes,
     ArrayAttr mapping) {
@@ -2228,7 +2228,7 @@ void transform::TileReductionUsingForeachThreadOp::build(
   // This is future-proof re mixed static-dynamic and setting up the proper
   // operands segment sizes attributes for multiple variadic operands.
   // In the absence of this, horrible bugs ensue.
-  // TODO: support mixed static-dynamic (see TileToForeachThreadOp).
+  // TODO: support mixed static-dynamic (see TileToForallOp).
   MLIRContext *ctx = builder.getContext();
   auto opTy = pdl::OperationType::get(ctx);
   auto staticNumThreadsAttr = builder.getDenseI64ArrayAttr(staticNumThreads);
@@ -2241,8 +2241,7 @@ void transform::TileReductionUsingForeachThreadOp::build(
         /*mapping=*/mapping);
 }
 
-DiagnosedSilenceableFailure
-transform::TileReductionUsingForeachThreadOp::applyToOne(
+DiagnosedSilenceableFailure transform::TileReductionUsingForallOp::applyToOne(
     LinalgOp target, transform::ApplyToEachResultList &results,
     transform::TransformState &state) {
   TrivialPatternRewriter rewriter(getContext());
@@ -2251,8 +2250,8 @@ transform::TileReductionUsingForeachThreadOp::applyToOne(
       getAsOpFoldResult(rewriter.getI64ArrayAttr(getNumThreads()));
   SmallVector<OpFoldResult> tileSizes =
       getAsOpFoldResult(rewriter.getI64ArrayAttr(getTileSizes()));
-  FailureOr<linalg::ForeachThreadReductionTilingResult> result =
-      linalg::tileReductionUsingForeachThread(
+  FailureOr<linalg::ForallReductionTilingResult> result =
+      linalg::tileReductionUsingForall(
           rewriter, cast<PartialReductionOpInterface>(target.getOperation()),
           numThreads, tileSizes, getMapping());
 
@@ -2553,15 +2552,14 @@ void transform::TileOp::getEffects(
 }
 
 //===----------------------------------------------------------------------===//
-// TileToForeachThreadOp
+// TileToForallOp
 //===----------------------------------------------------------------------===//
 
-void transform::TileToForeachThreadOp::build(OpBuilder &builder,
-                                             OperationState &result,
-                                             Value target,
-                                             ArrayRef<int64_t> staticTileSizes,
-                                             transform::TileSizesSpec,
-                                             ArrayAttr mapping) {
+void transform::TileToForallOp::build(OpBuilder &builder,
+                                      OperationState &result, Value target,
+                                      ArrayRef<int64_t> staticTileSizes,
+                                      transform::TileSizesSpec,
+                                      ArrayAttr mapping) {
   return build(builder, result,
                /*target=*/target,
                /*mixedTileSizes=*/
@@ -2570,10 +2568,11 @@ void transform::TileToForeachThreadOp::build(OpBuilder &builder,
                /*mapping=*/mapping);
 }
 
-void transform::TileToForeachThreadOp::build(
-    OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<OpFoldResult> mixedTileSizes, transform::TileSizesSpec,
-    ArrayAttr mapping) {
+void transform::TileToForallOp::build(OpBuilder &builder,
+                                      OperationState &result, Value target,
+                                      ArrayRef<OpFoldResult> mixedTileSizes,
+                                      transform::TileSizesSpec,
+                                      ArrayAttr mapping) {
   SmallVector<int64_t> staticTileSizes;
   SmallVector<Value> dynamicTileSizes;
   dispatchIndexOpFoldResults(mixedTileSizes, dynamicTileSizes, staticTileSizes);
@@ -2595,21 +2594,21 @@ void transform::TileToForeachThreadOp::build(
         /*mapping=*/mapping);
 }
 
-void transform::TileToForeachThreadOp::build(OpBuilder &builder,
-                                             OperationState &result,
-                                             Value target,
-                                             ArrayRef<int64_t> staticNumThreads,
-                                             transform::NumThreadsSpec,
-                                             ArrayAttr mapping) {
+void transform::TileToForallOp::build(OpBuilder &builder,
+                                      OperationState &result, Value target,
+                                      ArrayRef<int64_t> staticNumThreads,
+                                      transform::NumThreadsSpec,
+                                      ArrayAttr mapping) {
   return build(builder, result, target,
                getAsOpFoldResult(builder.getI64ArrayAttr(staticNumThreads)),
                NumThreadsSpec(), mapping);
 }
 
-void transform::TileToForeachThreadOp::build(
-    OpBuilder &builder, OperationState &result, Value target,
-    ArrayRef<OpFoldResult> mixedNumThreads, transform::NumThreadsSpec,
-    ArrayAttr mapping) {
+void transform::TileToForallOp::build(OpBuilder &builder,
+                                      OperationState &result, Value target,
+                                      ArrayRef<OpFoldResult> mixedNumThreads,
+                                      transform::NumThreadsSpec,
+                                      ArrayAttr mapping) {
   SmallVector<int64_t> staticNumThreads;
   SmallVector<Value> dynamicNumThreads;
   dispatchIndexOpFoldResults(mixedNumThreads, dynamicNumThreads,
@@ -2632,7 +2631,7 @@ void transform::TileToForeachThreadOp::build(
         /*mapping=*/mapping);
 }
 
-DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl(
+DiagnosedSilenceableFailure transform::tileToForallOpImpl(
     RewriterBase &rewriter, transform::TransformState &state,
     TransformOpInterface transformOp, ArrayRef<Operation *> targets,
     ArrayRef<OpFoldResult> mixedNumThreads,
@@ -2652,12 +2651,12 @@ DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl(
       return diag;
     }
     rewriter.setInsertionPoint(tileableOp);
-    FailureOr<linalg::ForeachThreadTilingResult> tilingResult = failure();
+    FailureOr<linalg::ForallTilingResult> tilingResult = failure();
     if (!mixedNumThreads.empty()) {
-      tilingResult = linalg::tileToForeachThreadOp(rewriter, tileableOp,
-                                                   mixedNumThreads, mapping);
+      tilingResult = linalg::tileToForallOp(rewriter, tileableOp,
+                                            mixedNumThreads, mapping);
     } else {
-      tilingResult = linalg::tileToForeachThreadOpUsingTileSizes(
+      tilingResult = linalg::tileToForallOpUsingTileSizes(
           rewriter, tileableOp, mixedTileSizes, mapping);
     }
 
@@ -2671,9 +2670,9 @@ DiagnosedSilenceableFailure transform::tileToForeachThreadOpImpl(
   return DiagnosedSilenceableFailure::success();
 }
 
-DiagnosedSilenceableFailure transform::TileToForeachThreadOp::apply(
-    transform::TransformResults &transformResults,
-    transform::TransformState &state) {
+DiagnosedSilenceableFailure
+transform::TileToForallOp::apply(transform::TransformResults &transformResults,
+                                 transform::TransformState &state) {
   IRRewriter rewriter(getContext());
   auto transformOp = cast<TransformOpInterface>(getOperation());
   ArrayRef<Operation *> targets = state.getPayloadOps(getTarget());
@@ -2701,20 +2700,20 @@ DiagnosedSilenceableFailure transform::TileToForeachThreadOp::apply(
   if (!status.succeeded())
     return status;
 
-  DiagnosedSilenceableFailure diag = tileToForeachThreadOpImpl(
-      rewriter, state, transformOp, targets, mixedNumThreads, mixedTileSizes,
-      getMapping(), tileOps, tiledOps);
+  DiagnosedSilenceableFailure diag =
+      tileToForallOpImpl(rewriter, state, transformOp, targets, mixedNumThreads,
+                         mixedTileSizes, getMapping(), tileOps, tiledOps);
 
   if (!diag.succeeded())
     return diag;
 
-  transformResults.set(getForeachThreadOp().cast<OpResult>(), tileOps);
+  transformResults.set(getForallOp().cast<OpResult>(), tileOps);
   transformResults.set(getTiledOp().cast<OpResult>(), tiledOps);
 
   return DiagnosedSilenceableFailure::success();
 }
 
-void transform::TileToForeachThreadOp::getEffects(
+void transform::TileToForallOp::getEffects(
     SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
   consumesHandle(getTarget(), effects);
   onlyReadsHandle(getTileSizes(), effects);
@@ -2725,17 +2724,17 @@ void transform::TileToForeachThreadOp::getEffects(
   modifiesPayload(effects);
 }
 
-SmallVector<OpFoldResult> TileToForeachThreadOp::getMixedNumThreads() {
+SmallVector<OpFoldResult> TileToForallOp::getMixedNumThreads() {
   Builder b(getContext());
   return getMixedValues(getStaticNumThreads(), getNumThreads(), b);
 }
 
-SmallVector<OpFoldResult> TileToForeachThreadOp::getMixedTileSizes() {
+SmallVector<OpFoldResult> TileToForallOp::getMixedTileSizes() {
   Builder b(getContext());
   return getMixedValues(getStaticTileSizes(), getTileSizes(), b);
 }
 
-LogicalResult TileToForeachThreadOp::verify() {
+LogicalResult TileToForallOp::verify() {
   int numThreadsSpec = static_cast<int>(!getMixedNumThreads().empty()) +
                        static_cast<int>(getPackedNumThreads() != Value());
   if (numThreadsSpec > 1)

diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 10a1451fb7cc3..50f89cfeec145 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -244,16 +244,16 @@ static OpFoldResult buildMin(OpBuilder &b, Location loc,
 /// Fill out the `tiledOffsets` and `tiledSizes` to be used to tile to a given
 /// number of threads.
 static void calculateTileOffsetsAndSizes(
-    RewriterBase &b, Location loc, scf::ForeachThreadOp foreachThreadOp,
+    RewriterBase &b, Location loc, scf::ForallOp forallOp,
     ArrayRef<OpFoldResult> numThreads, SmallVector<Range> loopRanges,
     bool omitTileOffsetBoundsCheck,
     std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
     SmallVector<OpFoldResult> &tiledOffsets,
     SmallVector<OpFoldResult> &tiledSizes) {
   OpBuilder::InsertionGuard g(b);
-  b.setInsertionPointToStart(foreachThreadOp.getBody(0));
+  b.setInsertionPointToStart(forallOp.getBody(0));
 
-  ValueRange threadIds = foreachThreadOp.getInductionVars();
+  ValueRange threadIds = forallOp.getInductionVars();
   SmallVector<OpFoldResult> nonZeroNumThreads =
       llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
         return !isConstantIntValue(ofr, 0);
@@ -314,19 +314,19 @@ static void calculateTileOffsetsAndSizes(
   }
 }
 
-/// Rewrite a TilingInterface `op` to a tiled `scf.foreach_thread`. The
+/// Rewrite a TilingInterface `op` to a tiled `scf.forall`. The
 /// tiling is specified by the number of tiles/threads `numThreads` and the
 /// optional nominal tile size `nominalTileSizes`. If `nominalTilSizes` is
 /// not specified, then  it is derived from `numThreads` as `ceilDiv(dimSize[i],
 /// numThreads[i])`. If non-empty, the `mapping` is added as an
-/// attribute to the resulting `scf.foreach_thread`. A zero tile sizes indicate
+/// attribute to the resulting `scf.forall`. A zero tile sizes indicate
 /// that the dimension is not tiled, and can be thought of as tiling by the full
 /// size of data.
 /// It is the user's responsibility to ensure that `numThreads` is a valid
 /// tiling specification (i.e. that only tiles parallel dimensions, e.g. in the
 /// Linalg case). If `omitTileOffsetBoundsCheck` is true, then the function will
 /// assume that `tileSize[i] * (numThread[i] -1) <= dimSize[i]` holds.
-static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
+static FailureOr<ForallTilingResult> tileToForallOpImpl(
     RewriterBase &b, TilingInterface op, ArrayRef<OpFoldResult> numThreads,
     std::optional<ArrayRef<OpFoldResult>> nominalTileSizes,
     std::optional<ArrayAttr> mapping, bool omitTileOffsetBoundsCheck) {
@@ -356,26 +356,25 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
 
   Operation *tiledOp = nullptr;
 
-  // 1. Create the ForeachThreadOp. We don't use the lambda body-builder
+  // 1. Create the ForallOp. We don't use the lambda body-builder
   // version because we require the use of RewriterBase in the body, so we
   // manually move the insertion point to the body below.
-  scf::ForeachThreadOp foreachThreadOp = b.create<scf::ForeachThreadOp>(
+  scf::ForallOp forallOp = b.create<scf::ForallOp>(
       loc, getAsOpFoldResult((materializedNonZeroNumThreads)), dest, mapping);
 
-  // 2. Fill out the ForeachThreadOp body.
+  // 2. Fill out the ForallOp body.
   SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
-  calculateTileOffsetsAndSizes(b, loc, foreachThreadOp, numThreads, loopRanges,
+  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, loopRanges,
                                omitTileOffsetBoundsCheck, nominalTileSizes,
                                tiledOffsets, tiledSizes);
 
   // 3. Clone the tileable op and update its destination operands to use the
-  // output bbArgs of the ForeachThreadOp.
-  ArrayRef<BlockArgument> destBbArgs =
-      foreachThreadOp.getOutputBlockArguments();
+  // output bbArgs of the ForallOp.
+  ArrayRef<BlockArgument> destBbArgs = forallOp.getOutputBlockArguments();
   {
-    // 3.a. RAII guard, inserting within foreachThreadOp, before terminator.
+    // 3.a. RAII guard, inserting within forallOp, before terminator.
     OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(foreachThreadOp.getTerminator());
+    b.setInsertionPoint(forallOp.getTerminator());
     Operation *clonedOp = b.clone(*op.getOperation());
     auto destinationStyleOp = dyn_cast<DestinationStyleOpInterface>(clonedOp);
     if (destinationStyleOp) {
@@ -404,7 +403,7 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
                            tilingInterfaceOp->getResults(), destBbArgs)) {
     // 5.a. Partial subset information is inserted just before the terminator.
     OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(foreachThreadOp.getTerminator());
+    b.setInsertionPoint(forallOp.getTerminator());
 
     SmallVector<OpFoldResult> resultOffsets, resultSizes;
     if (failed(op.getResultTilePosition(b, std::get<0>(it), tiledOffsets,
@@ -415,27 +414,27 @@ static FailureOr<ForeachThreadTilingResult> tileToForeachThreadOpImpl(
 
     // 5.b. Parallel insertions are inserted at the end of the combining
     // terminator.
-    b.setInsertionPointToEnd(foreachThreadOp.getTerminator().getBody());
+    b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
     b.create<tensor::ParallelInsertSliceOp>(loc, std::get<1>(it),
                                             std::get<2>(it), resultOffsets,
                                             resultSizes, strides);
   }
-  return ForeachThreadTilingResult{foreachThreadOp, tiledOp};
+  return ForallTilingResult{forallOp, tiledOp};
 }
 
-FailureOr<ForeachThreadTilingResult>
-linalg::tileToForeachThreadOp(RewriterBase &b, TilingInterface op,
-                              ArrayRef<OpFoldResult> numThreads,
-                              std::optional<ArrayAttr> mapping) {
-  return tileToForeachThreadOpImpl(b, op, numThreads,
-                                   /*nominalTileSizes=*/std::nullopt, mapping,
-                                   /*omitTileOffsetBoundsCheck=*/false);
+FailureOr<ForallTilingResult>
+linalg::tileToForallOp(RewriterBase &b, TilingInterface op,
+                       ArrayRef<OpFoldResult> numThreads,
+                       std::optional<ArrayAttr> mapping) {
+  return tileToForallOpImpl(b, op, numThreads,
+                            /*nominalTileSizes=*/std::nullopt, mapping,
+                            /*omitTileOffsetBoundsCheck=*/false);
 }
 
-FailureOr<ForeachThreadTilingResult>
-linalg::tileToForeachThreadOpUsingTileSizes(RewriterBase &b, TilingInterface op,
-                                            ArrayRef<OpFoldResult> tileSizes,
-                                            std::optional<ArrayAttr> mapping) {
+FailureOr<ForallTilingResult>
+linalg::tileToForallOpUsingTileSizes(RewriterBase &b, TilingInterface op,
+                                     ArrayRef<OpFoldResult> tileSizes,
+                                     std::optional<ArrayAttr> mapping) {
   SmallVector<Range> loopRanges = op.getIterationDomain(b);
   unsigned nLoops = loopRanges.size();
   SmallVector<OpFoldResult> numThreads;
@@ -450,9 +449,9 @@ linalg::tileToForeachThreadOpUsingTileSizes(RewriterBase &b, TilingInterface op,
           b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
     numThreads.push_back(numTiles);
   }
-  return tileToForeachThreadOpImpl(b, op, numThreads,
-                                   /*nominalTileSizes=*/tileSizes, mapping,
-                                   /*omitTileOffsetBoundsCheck=*/true);
+  return tileToForallOpImpl(b, op, numThreads,
+                            /*nominalTileSizes=*/tileSizes, mapping,
+                            /*omitTileOffsetBoundsCheck=*/true);
 }
 
 template <typename LoopTy>
@@ -608,12 +607,10 @@ tileLinalgOpImpl(RewriterBase &b, LinalgOp op, ArrayRef<OpFoldResult> tileSizes,
       res, loops, outermostLoop ? outermostLoop->getResults() : tensorResults};
 }
 
-FailureOr<linalg::ForeachThreadReductionTilingResult>
-linalg::tileReductionUsingForeachThread(RewriterBase &b,
-                                        PartialReductionOpInterface op,
-                                        ArrayRef<OpFoldResult> numThreads,
-                                        ArrayRef<OpFoldResult> tileSizes,
-                                        std::optional<ArrayAttr> mapping) {
+FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
+    RewriterBase &b, PartialReductionOpInterface op,
+    ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,
+    std::optional<ArrayAttr> mapping) {
   Location loc = op.getLoc();
   OpBuilder::InsertionGuard g(b);
 
@@ -679,28 +676,27 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
   SmallVector<Value> materializedNonZeroNumThreads =
       getAsValues(b, loc, nonZeroNumThreads);
 
-  // 2. Create the ForeachThreadOp with an empty region.
-  scf::ForeachThreadOp foreachThreadOp = b.create<scf::ForeachThreadOp>(
+  // 2. Create the ForallOp with an empty region.
+  scf::ForallOp forallOp = b.create<scf::ForallOp>(
       loc, getAsOpFoldResult(materializedNonZeroNumThreads),
       (*identityTensor)->getResults(), mapping);
 
   // 3. Calculate the tile offsets and sizes for the subsequent loop that will
-  // be nested under `foreachThreadOp`.
+  // be nested under `forallOp`.
   SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
-  calculateTileOffsetsAndSizes(
-      b, loc, foreachThreadOp, numThreads, iterationDomain,
-      /*omitTileOffsetBoundsCheck =*/false,
-      /*nominalTileSizes=*/std::nullopt, tiledOffsets, tiledSizes);
+  calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain,
+                               /*omitTileOffsetBoundsCheck =*/false,
+                               /*nominalTileSizes=*/std::nullopt, tiledOffsets,
+                               tiledSizes);
 
   // 4. Clone the tileable op and update its destination operands to use the
-  // output bbArgs of the ForeachThreadOp.
+  // output bbArgs of the ForallOp.
   ValueRange tilingResults;
-  ArrayRef<BlockArgument> destBbArgs =
-      foreachThreadOp.getOutputBlockArguments();
+  ArrayRef<BlockArgument> destBbArgs = forallOp.getOutputBlockArguments();
   {
-    // 4.a. RAII guard, inserting within foreachThreadOp, before terminator.
+    // 4.a. RAII guard, inserting within forallOp, before terminator.
     OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(foreachThreadOp.getTerminator());
+    b.setInsertionPoint(forallOp.getTerminator());
 
     SmallVector<Value> tiledDpsInitOperands;
     for (OpOperand *initOperand : destinationStyleOp.getDpsInitOperands()) {
@@ -712,7 +708,7 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
                                            b.getIndexAttr(0));
       SmallVector<OpFoldResult> sizes = tiledSizes;
       sizes[reductionDim] = b.getIndexAttr(1);
-      outOffsets[reductionDim] = foreachThreadOp.getInductionVars().front();
+      outOffsets[reductionDim] = forallOp.getInductionVars().front();
       // TODO: use SubsetExtractOpInterface once it is available.
       tiledDpsInitOperands.push_back(b.create<tensor::ExtractSliceOp>(
           loc, initOperand->get().getType().cast<RankedTensorType>(),
@@ -746,7 +742,7 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
       if (failed(maybeTiled))
         return b.notifyMatchFailure(op, "failed tileLinalgOpImpl");
 
-      SmallVector<Value> ids = foreachThreadOp.getInductionVars();
+      SmallVector<Value> ids = forallOp.getInductionVars();
       mapLoopToProcessorIds(cast<scf::ForOp>(maybeTiled->loops.back()), ids,
                             materializedNonZeroNumThreads);
       assert(maybeTiled->loops.size() == 1 &&
@@ -763,7 +759,7 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
            llvm::seq<unsigned>(0, dest.size()), tilingResults, destBbArgs)) {
     // 6.a. Partial subset information is inserted just before the terminator.
     OpBuilder::InsertionGuard g(b);
-    b.setInsertionPoint(foreachThreadOp.getTerminator());
+    b.setInsertionPoint(forallOp.getTerminator());
 
     SmallVector<OpFoldResult> resultOffsets, resultSizes;
     if (failed(tilingInterfaceOp.getResultTilePosition(
@@ -774,7 +770,7 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
     int64_t sizeIdx = 0;
     for (int64_t i = 0, e = numThreads.size(); i < e; ++i) {
       if (i == reductionDim) {
-        resultOffsetsRank.push_back(foreachThreadOp.getInductionVars().front());
+        resultOffsetsRank.push_back(forallOp.getInductionVars().front());
         resultSizesRank.push_back(b.getIndexAttr(1));
         continue;
       }
@@ -786,21 +782,21 @@ linalg::tileReductionUsingForeachThread(RewriterBase &b,
 
     // 6.b. Parallel insertions are inserted at the end of the combining
     // terminator.
-    b.setInsertionPointToEnd(foreachThreadOp.getTerminator().getBody());
+    b.setInsertionPointToEnd(forallOp.getTerminator().getBody());
     b.create<tensor::ParallelInsertSliceOp>(
         loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
   }
 
   // 7. Merge the partial reductions.
-  b.setInsertionPointAfter(foreachThreadOp);
+  b.setInsertionPointAfter(forallOp);
   Operation *mergeOp =
-      op.mergeReductions(b, loc, foreachThreadOp->getResults(), reductionDim);
+      op.mergeReductions(b, loc, forallOp->getResults(), reductionDim);
   b.replaceOp(op, mergeOp->getResults());
 
   // 8. Return.
-  ForeachThreadReductionTilingResult results;
+  ForallReductionTilingResult results;
   results.initialOp = *identityTensor;
-  results.loops = foreachThreadOp;
+  results.loops = forallOp;
   results.parallelTiledOp = tiledOp;
   results.mergeOp = mergeOp;
   return results;

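To make the tiling entry points above concrete, here is a rough hand-written sketch (shapes, tile size, and SSA names are assumed, not taken from this patch) of the IR shape `linalg::tileToForallOp` produces when tiling the parallel dimension of a small matmul across two threads: each thread extracts its slice of the operands, computes on it, and commits the result through `tensor.parallel_insert_slice` in the combining terminator.

    // Sketch only: assumed shapes and a fixed tile size of 2 rows per thread.
    func.func @tile_matmul_sketch(%lhs: tensor<4x6xf32>, %rhs: tensor<6x8xf32>,
                                  %init: tensor<4x8xf32>) -> tensor<4x8xf32> {
      %c2 = arith.constant 2 : index
      %res = scf.forall (%tid) in (%c2) shared_outs(%out = %init) -> (tensor<4x8xf32>) {
        // Per-thread offset derived from the induction variable.
        %off = affine.apply affine_map<(d0) -> (d0 * 2)>(%tid)
        %lhs_t = tensor.extract_slice %lhs[%off, 0] [2, 6] [1, 1]
            : tensor<4x6xf32> to tensor<2x6xf32>
        %out_t = tensor.extract_slice %out[%off, 0] [2, 8] [1, 1]
            : tensor<4x8xf32> to tensor<2x8xf32>
        %mm = linalg.matmul ins(%lhs_t, %rhs : tensor<2x6xf32>, tensor<6x8xf32>)
                            outs(%out_t : tensor<2x8xf32>) -> tensor<2x8xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %mm into %out[%off, 0] [2, 8] [1, 1]
              : tensor<2x8xf32> into tensor<4x8xf32>
        }
      }
      return %res : tensor<4x8xf32>
    }

The real implementation additionally clamps offsets and sizes (see `calculateTileOffsetsAndSizes` above) and supports mixed static/dynamic thread counts; the sketch only shows the destination-passing structure of the result.
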
diff  --git a/mlir/lib/Dialect/SCF/IR/SCF.cpp b/mlir/lib/Dialect/SCF/IR/SCF.cpp
index 9032f533e5fbe..f1d07c8a5c56f 100644
--- a/mlir/lib/Dialect/SCF/IR/SCF.cpp
+++ b/mlir/lib/Dialect/SCF/IR/SCF.cpp
@@ -1106,10 +1106,10 @@ Speculation::Speculatability ForOp::getSpeculatability() {
 }
 
 //===----------------------------------------------------------------------===//
-// ForeachThreadOp
+// ForallOp
 //===----------------------------------------------------------------------===//
 
-LogicalResult ForeachThreadOp::verify() {
+LogicalResult ForallOp::verify() {
   unsigned numLoops = getRank();
   // Check number of outputs.
   if (getNumResults() != getOutputs().size())
@@ -1156,7 +1156,7 @@ LogicalResult ForeachThreadOp::verify() {
   return success();
 }
 
-void ForeachThreadOp::print(OpAsmPrinter &p) {
+void ForallOp::print(OpAsmPrinter &p) {
   Operation *op = getOperation();
   p << " (" << getInductionVars();
   if (isNormalized()) {
@@ -1187,8 +1187,7 @@ void ForeachThreadOp::print(OpAsmPrinter &p) {
                                            getStaticStepAttrName()});
 }
 
-ParseResult ForeachThreadOp::parse(OpAsmParser &parser,
-                                   OperationState &result) {
+ParseResult ForallOp::parse(OpAsmParser &parser, OperationState &result) {
   OpBuilder b(parser.getContext());
   auto indexType = b.getIndexType();
 
@@ -1267,7 +1266,7 @@ ParseResult ForeachThreadOp::parse(OpAsmParser &parser,
     return failure();
 
   // Ensure terminator and move region.
-  ForeachThreadOp::ensureTerminator(*region, b, result.location);
+  ForallOp::ensureTerminator(*region, b, result.location);
   result.addRegion(std::move(region));
 
   // Parse the optional attribute list.
@@ -1287,7 +1286,7 @@ ParseResult ForeachThreadOp::parse(OpAsmParser &parser,
 }
 
 // Builder that takes loop bounds.
-void ForeachThreadOp::build(
+void ForallOp::build(
     mlir::OpBuilder &b, mlir::OperationState &result,
     ArrayRef<OpFoldResult> lbs, ArrayRef<OpFoldResult> ubs,
     ArrayRef<OpFoldResult> steps, ValueRange outputs,
@@ -1318,7 +1317,7 @@ void ForeachThreadOp::build(
                               static_cast<int32_t>(dynamicSteps.size()),
                               static_cast<int32_t>(outputs.size())}));
   if (mapping.has_value()) {
-    result.addAttribute(ForeachThreadOp::getMappingAttrName(result.name),
+    result.addAttribute(ForallOp::getMappingAttrName(result.name),
                         mapping.value());
   }
 
@@ -1337,20 +1336,19 @@ void ForeachThreadOp::build(
 
   b.setInsertionPointToStart(&bodyBlock);
   if (!bodyBuilderFn) {
-    ForeachThreadOp::ensureTerminator(*bodyRegion, b, result.location);
+    ForallOp::ensureTerminator(*bodyRegion, b, result.location);
     return;
   }
   bodyBuilderFn(b, result.location, bodyBlock.getArguments());
 #ifndef NDEBUG
-  auto terminator =
-      llvm::dyn_cast<PerformConcurrentlyOp>(bodyBlock.getTerminator());
+  auto terminator = llvm::dyn_cast<InParallelOp>(bodyBlock.getTerminator());
   assert(terminator &&
-         "expected bodyBuilderFn to create PerformConcurrentlyOp terminator");
+         "expected bodyBuilderFn to create InParallelOp terminator");
 #endif // NDEBUG
 }
 
 // Builder that takes loop bounds.
-void ForeachThreadOp::build(
+void ForallOp::build(
     mlir::OpBuilder &b, mlir::OperationState &result,
     ArrayRef<OpFoldResult> ubs, ValueRange outputs,
     std::optional<ArrayAttr> mapping,
@@ -1362,7 +1360,7 @@ void ForeachThreadOp::build(
 }
 
 // Checks if the lbs are zeros and steps are ones.
-bool ForeachThreadOp::isNormalized() {
+bool ForallOp::isNormalized() {
   auto allEqual = [](ArrayRef<OpFoldResult> results, int64_t val) {
     return llvm::all_of(results, [&](OpFoldResult ofr) {
       auto intValue = getConstantIntValue(ofr);
@@ -1375,22 +1373,22 @@ bool ForeachThreadOp::isNormalized() {
 // The ensureTerminator method generated by SingleBlockImplicitTerminator is
 // unaware of the fact that our terminator also needs a region to be
 // well-formed. We override it here to ensure that we do the right thing.
-void ForeachThreadOp::ensureTerminator(Region &region, OpBuilder &builder,
-                                       Location loc) {
-  OpTrait::SingleBlockImplicitTerminator<PerformConcurrentlyOp>::Impl<
-      ForeachThreadOp>::ensureTerminator(region, builder, loc);
+void ForallOp::ensureTerminator(Region &region, OpBuilder &builder,
+                                Location loc) {
+  OpTrait::SingleBlockImplicitTerminator<InParallelOp>::Impl<
+      ForallOp>::ensureTerminator(region, builder, loc);
   auto terminator =
-      llvm::dyn_cast<PerformConcurrentlyOp>(region.front().getTerminator());
+      llvm::dyn_cast<InParallelOp>(region.front().getTerminator());
   if (terminator.getRegion().empty())
     builder.createBlock(&terminator.getRegion());
 }
 
-PerformConcurrentlyOp ForeachThreadOp::getTerminator() {
-  return cast<PerformConcurrentlyOp>(getBody()->getTerminator());
+InParallelOp ForallOp::getTerminator() {
+  return cast<InParallelOp>(getBody()->getTerminator());
 }
 
 /// Helper to sort `values` according to matching `keys`.
-SmallVector<Value> ForeachThreadOp::getValuesSortedByKey(
+SmallVector<Value> ForallOp::getValuesSortedByKey(
     ArrayRef<Attribute> keys, ValueRange values,
     llvm::function_ref<bool(Attribute, Attribute)> compare) {
   if (keys.empty())
@@ -1406,28 +1404,27 @@ SmallVector<Value> ForeachThreadOp::getValuesSortedByKey(
   return res;
 }
 
-ForeachThreadOp mlir::scf::getForeachThreadOpThreadIndexOwner(Value val) {
+ForallOp mlir::scf::getForallOpThreadIndexOwner(Value val) {
   auto tidxArg = val.dyn_cast<BlockArgument>();
   if (!tidxArg)
-    return ForeachThreadOp();
+    return ForallOp();
   assert(tidxArg.getOwner() && "unlinked block argument");
   auto *containingOp = tidxArg.getOwner()->getParentOp();
-  return dyn_cast<ForeachThreadOp>(containingOp);
+  return dyn_cast<ForallOp>(containingOp);
 }
 
 namespace {
-/// Fold tensor.dim(foreach_thread shared_outs(... = %t)) to tensor.dim(%t).
-struct DimOfForeachThreadOp : public OpRewritePattern<tensor::DimOp> {
+/// Fold tensor.dim(forall shared_outs(... = %t)) to tensor.dim(%t).
+struct DimOfForallOp : public OpRewritePattern<tensor::DimOp> {
   using OpRewritePattern<tensor::DimOp>::OpRewritePattern;
 
   LogicalResult matchAndRewrite(tensor::DimOp dimOp,
                                 PatternRewriter &rewriter) const final {
-    auto foreachThreadOp = dimOp.getSource().getDefiningOp<ForeachThreadOp>();
-    if (!foreachThreadOp)
+    auto forallOp = dimOp.getSource().getDefiningOp<ForallOp>();
+    if (!forallOp)
       return failure();
     Value sharedOut =
-        foreachThreadOp.getTiedOpOperand(dimOp.getSource().cast<OpResult>())
-            ->get();
+        forallOp.getTiedOpOperand(dimOp.getSource().cast<OpResult>())->get();
     rewriter.updateRootInPlace(
         dimOp, [&]() { dimOp.getSourceMutable().assign(sharedOut); });
     return success();
@@ -1435,29 +1432,29 @@ struct DimOfForeachThreadOp : public OpRewritePattern<tensor::DimOp> {
 };
 } // namespace
 
-void ForeachThreadOp::getCanonicalizationPatterns(RewritePatternSet &results,
-                                                  MLIRContext *context) {
-  results.add<DimOfForeachThreadOp>(context);
+void ForallOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                           MLIRContext *context) {
+  results.add<DimOfForallOp>(context);
 }
 
 //===----------------------------------------------------------------------===//
-// PerformConcurrentlyOp
+// InParallelOp
 //===----------------------------------------------------------------------===//
 
-// Build a PerformConcurrentlyOp with mixed static and dynamic entries.
-void PerformConcurrentlyOp::build(OpBuilder &b, OperationState &result) {
+// Build a InParallelOp with mixed static and dynamic entries.
+void InParallelOp::build(OpBuilder &b, OperationState &result) {
   OpBuilder::InsertionGuard g(b);
   Region *bodyRegion = result.addRegion();
   b.createBlock(bodyRegion);
 }
 
-LogicalResult PerformConcurrentlyOp::verify() {
-  scf::ForeachThreadOp foreachThreadOp =
-      dyn_cast<scf::ForeachThreadOp>(getOperation()->getParentOp());
-  if (!foreachThreadOp)
-    return this->emitOpError("expected foreach_thread op parent");
+LogicalResult InParallelOp::verify() {
+  scf::ForallOp forallOp =
+      dyn_cast<scf::ForallOp>(getOperation()->getParentOp());
+  if (!forallOp)
+    return this->emitOpError("expected forall op parent");
 
-  // TODO: PerformConcurrentlyOpInterface.
+  // TODO: InParallelOpInterface.
   for (Operation &op : getRegion().front().getOperations()) {
     if (!isa<tensor::ParallelInsertSliceOp>(op)) {
       return this->emitOpError("expected only ")
@@ -1466,14 +1463,14 @@ LogicalResult PerformConcurrentlyOp::verify() {
 
     // Verify that inserts are into out block arguments.
     Value dest = cast<tensor::ParallelInsertSliceOp>(op).getDest();
-    ArrayRef<BlockArgument> regionOutArgs = foreachThreadOp.getRegionOutArgs();
+    ArrayRef<BlockArgument> regionOutArgs = forallOp.getRegionOutArgs();
     if (!llvm::is_contained(regionOutArgs, dest))
       return op.emitOpError("may only insert into an output block argument");
   }
   return success();
 }
 
-void PerformConcurrentlyOp::print(OpAsmPrinter &p) {
+void InParallelOp::print(OpAsmPrinter &p) {
   p << " ";
   p.printRegion(getRegion(),
                 /*printEntryBlockArgs=*/false,
@@ -1481,8 +1478,7 @@ void PerformConcurrentlyOp::print(OpAsmPrinter &p) {
   p.printOptionalAttrDict(getOperation()->getAttrs());
 }
 
-ParseResult PerformConcurrentlyOp::parse(OpAsmParser &parser,
-                                         OperationState &result) {
+ParseResult InParallelOp::parse(OpAsmParser &parser, OperationState &result) {
   auto &builder = parser.getBuilder();
 
   SmallVector<OpAsmParser::Argument, 8> regionOperands;
@@ -1500,11 +1496,11 @@ ParseResult PerformConcurrentlyOp::parse(OpAsmParser &parser,
   return success();
 }
 
-OpResult PerformConcurrentlyOp::getParentResult(int64_t idx) {
+OpResult InParallelOp::getParentResult(int64_t idx) {
   return getOperation()->getParentOp()->getResult(idx);
 }
 
-SmallVector<BlockArgument> PerformConcurrentlyOp::getDests() {
+SmallVector<BlockArgument> InParallelOp::getDests() {
   return llvm::to_vector<4>(
       llvm::map_range(getYieldingOps(), [](Operation &op) {
         // Add new ops here as needed.
@@ -1513,7 +1509,7 @@ SmallVector<BlockArgument> PerformConcurrentlyOp::getDests() {
       }));
 }
 
-llvm::iterator_range<Block::iterator> PerformConcurrentlyOp::getYieldingOps() {
+llvm::iterator_range<Block::iterator> InParallelOp::getYieldingOps() {
   return getRegion().front().getOperations();
 }
 

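Putting the renamed pieces together, the textual form accepted by the printer/parser and the `InParallelOp` verifier above looks roughly like the following sketch (shapes and names assumed): the `scf.forall` body ends in an `scf.forall.in_parallel` terminator that may only contain `tensor.parallel_insert_slice` ops writing into the shared output block arguments.

    // Sketch only: assumed static shape and fill value.
    func.func @forall_sketch(%t: tensor<128xf32>) -> tensor<128xf32> {
      %c128 = arith.constant 128 : index
      %cst = arith.constant 0.0 : f32
      %res = scf.forall (%i) in (%c128) shared_outs(%o = %t) -> (tensor<128xf32>) {
        %slice = tensor.extract_slice %o[%i] [1] [1] : tensor<128xf32> to tensor<1xf32>
        %filled = linalg.fill ins(%cst : f32) outs(%slice : tensor<1xf32>) -> tensor<1xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %filled into %o[%i] [1] [1]
              : tensor<1xf32> into tensor<128xf32>
        }
      }
      return %res : tensor<128xf32>
    }
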
diff  --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index d5c227967b36a..91a7bb42e402d 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -1003,9 +1003,9 @@ struct YieldOpInterface
 };
 
 /// Return `true` if the given loop may have 0 iterations.
-bool mayHaveZeroIterations(scf::ForeachThreadOp foreachThreadOp) {
-  for (auto [lb, ub] : llvm::zip(foreachThreadOp.getMixedLowerBound(),
-                                 foreachThreadOp.getMixedUpperBound())) {
+bool mayHaveZeroIterations(scf::ForallOp forallOp) {
+  for (auto [lb, ub] : llvm::zip(forallOp.getMixedLowerBound(),
+                                 forallOp.getMixedUpperBound())) {
     std::optional<int64_t> lbConst = getConstantIntValue(lb);
     std::optional<int64_t> ubConst = getConstantIntValue(ub);
     if (!lbConst.has_value() || !ubConst.has_value() || *lbConst >= *ubConst)
@@ -1014,39 +1014,39 @@ bool mayHaveZeroIterations(scf::ForeachThreadOp foreachThreadOp) {
   return false;
 }
 
-/// Bufferization of ForeachThreadOp. This also bufferizes the terminator of the
-/// region. There are op interfaces for the terminators (PerformConcurrentlyOp
+/// Bufferization of ForallOp. This also bufferizes the terminator of the
+/// region. There are op interfaces for the terminators (InParallelOp
 /// and ParallelInsertSliceOp), but these are only used during analysis. Not
 /// for bufferization.
-struct ForeachThreadOpInterface
-    : public BufferizableOpInterface::ExternalModel<ForeachThreadOpInterface,
-                                                    ForeachThreadOp> {
+struct ForallOpInterface
+    : public BufferizableOpInterface::ExternalModel<ForallOpInterface,
+                                                    ForallOp> {
   bool bufferizesToMemoryRead(Operation *op, OpOperand &opOperand,
                               const AnalysisState &state) const {
-    auto foreachThreadOp = cast<ForeachThreadOp>(op);
+    auto forallOp = cast<ForallOp>(op);
 
     // If the loop has zero iterations, the results of the op are their
     // corresponding shared_outs, meaning that the shared_outs bufferize to a
     // read.
-    if (mayHaveZeroIterations(foreachThreadOp))
+    if (mayHaveZeroIterations(forallOp))
       return true;
 
-    // scf::ForeachThreadOp alone doesn't bufferize to a memory read, one of the
+    // scf::ForallOp alone doesn't bufferize to a memory read, one of the
     // uses of its matching bbArg may.
-    return state.isValueRead(foreachThreadOp.getTiedBlockArgument(&opOperand));
+    return state.isValueRead(forallOp.getTiedBlockArgument(&opOperand));
   }
 
   bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand,
                                const AnalysisState &state) const {
-    // Outputs of scf::ForeachThreadOps are always considered as a write.
+    // Outputs of scf::ForallOps are always considered as a write.
     return true;
   }
 
   AliasingOpResultList getAliasingOpResults(Operation *op, OpOperand &opOperand,
                                             const AnalysisState &state) const {
-    auto foreachThreadOp = cast<ForeachThreadOp>(op);
-    return {{{foreachThreadOp.getTiedOpResult(&opOperand),
-              BufferRelation::Equivalent}}};
+    auto forallOp = cast<ForallOp>(op);
+    return {
+        {{forallOp.getTiedOpResult(&opOperand), BufferRelation::Equivalent}}};
   }
 
   bool isWritable(Operation *op, Value value,
@@ -1057,12 +1057,12 @@ struct ForeachThreadOpInterface
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           const BufferizationOptions &options) const {
     OpBuilder::InsertionGuard guard(rewriter);
-    auto foreachThreadOp = cast<ForeachThreadOp>(op);
-    int64_t rank = foreachThreadOp.getRank();
+    auto forallOp = cast<ForallOp>(op);
+    int64_t rank = forallOp.getRank();
 
     // Get buffers for all output operands.
     SmallVector<Value> buffers;
-    for (Value out : foreachThreadOp.getOutputs()) {
+    for (Value out : forallOp.getOutputs()) {
       FailureOr<Value> buffer = getBuffer(rewriter, out, options);
       if (failed(buffer))
         return failure();
@@ -1070,36 +1070,34 @@ struct ForeachThreadOpInterface
     }
 
     // Use buffers instead of block arguments.
-    rewriter.setInsertionPointToStart(foreachThreadOp.getBody());
-    for (const auto &it :
-         llvm::zip(foreachThreadOp.getBody()->getArguments().drop_front(rank),
-                   buffers)) {
+    rewriter.setInsertionPointToStart(forallOp.getBody());
+    for (const auto &it : llvm::zip(
+             forallOp.getBody()->getArguments().drop_front(rank), buffers)) {
       BlockArgument bbArg = std::get<0>(it);
       Value buffer = std::get<1>(it);
       Value bufferAsTensor =
-          rewriter.create<ToTensorOp>(foreachThreadOp.getLoc(), buffer);
+          rewriter.create<ToTensorOp>(forallOp.getLoc(), buffer);
       bbArg.replaceAllUsesWith(bufferAsTensor);
     }
 
-    // Create new ForeachThreadOp without any results and drop the automatically
+    // Create new ForallOp without any results and drop the automatically
     // introduced terminator.
-    rewriter.setInsertionPoint(foreachThreadOp);
-    ForeachThreadOp newForeachThreadOp;
-    newForeachThreadOp = rewriter.create<ForeachThreadOp>(
-        foreachThreadOp.getLoc(), foreachThreadOp.getMixedLowerBound(),
-        foreachThreadOp.getMixedUpperBound(), foreachThreadOp.getMixedStep(),
-        /*outputs=*/ValueRange(), foreachThreadOp.getMapping());
+    rewriter.setInsertionPoint(forallOp);
+    ForallOp newForallOp;
+    newForallOp = rewriter.create<ForallOp>(
+        forallOp.getLoc(), forallOp.getMixedLowerBound(),
+        forallOp.getMixedUpperBound(), forallOp.getMixedStep(),
+        /*outputs=*/ValueRange(), forallOp.getMapping());
 
-    newForeachThreadOp.getBody()->getTerminator()->erase();
+    newForallOp.getBody()->getTerminator()->erase();
 
     // Move over block contents of the old op.
     SmallVector<Value> replacementBbArgs;
-    replacementBbArgs.append(
-        newForeachThreadOp.getBody()->getArguments().begin(),
-        newForeachThreadOp.getBody()->getArguments().end());
-    replacementBbArgs.append(foreachThreadOp.getOutputs().size(), Value());
-    rewriter.mergeBlocks(foreachThreadOp.getBody(),
-                         newForeachThreadOp.getBody(), replacementBbArgs);
+    replacementBbArgs.append(newForallOp.getBody()->getArguments().begin(),
+                             newForallOp.getBody()->getArguments().end());
+    replacementBbArgs.append(forallOp.getOutputs().size(), Value());
+    rewriter.mergeBlocks(forallOp.getBody(), newForallOp.getBody(),
+                         replacementBbArgs);
 
     // Remove the old op and replace all of its uses.
     replaceOpWithBufferizedValues(rewriter, op, buffers);
@@ -1110,29 +1108,29 @@ struct ForeachThreadOpInterface
   FailureOr<BaseMemRefType>
   getBufferType(Operation *op, Value value, const BufferizationOptions &options,
                 const DenseMap<Value, BaseMemRefType> &fixedTypes) const {
-    auto foreachThreadOp = cast<ForeachThreadOp>(op);
+    auto forallOp = cast<ForallOp>(op);
 
     if (auto bbArg = value.dyn_cast<BlockArgument>())
       // A tensor block argument has the same bufferized type as the
       // corresponding output operand.
       return bufferization::getBufferType(
-          foreachThreadOp.getTiedOpOperand(bbArg)->get(), options, fixedTypes);
+          forallOp.getTiedOpOperand(bbArg)->get(), options, fixedTypes);
 
     // The bufferized result type is the same as the bufferized type of the
     // corresponding output operand.
     return bufferization::getBufferType(
-        foreachThreadOp.getOutputs()[value.cast<OpResult>().getResultNumber()],
+        forallOp.getOutputs()[value.cast<OpResult>().getResultNumber()],
         options, fixedTypes);
   }
 
   bool isRepetitiveRegion(Operation *op, unsigned index) const {
-    auto foreachThreadOp = cast<ForeachThreadOp>(op);
+    auto forallOp = cast<ForallOp>(op);
 
     // This op is repetitive if it has 1 or more steps.
     // If the control variables are dynamic, it is also considered so.
-    for (auto [lb, ub, step] : llvm::zip(foreachThreadOp.getMixedLowerBound(),
-                                         foreachThreadOp.getMixedUpperBound(),
-                                         foreachThreadOp.getMixedStep())) {
+    for (auto [lb, ub, step] :
+         llvm::zip(forallOp.getMixedLowerBound(), forallOp.getMixedUpperBound(),
+                   forallOp.getMixedStep())) {
       std::optional<int64_t> lbConstant = getConstantIntValue(lb);
       if (!lbConstant)
         return true;
@@ -1152,10 +1150,10 @@ struct ForeachThreadOpInterface
   }
 };
 
-/// Nothing to do for PerformConcurrentlyOp.
-struct PerformConcurrentlyOpInterface
-    : public BufferizableOpInterface::ExternalModel<
-          PerformConcurrentlyOpInterface, PerformConcurrentlyOp> {
+/// Nothing to do for InParallelOp.
+struct InParallelOpInterface
+    : public BufferizableOpInterface::ExternalModel<InParallelOpInterface,
+                                                    InParallelOp> {
   LogicalResult bufferize(Operation *op, RewriterBase &b,
                           const BufferizationOptions &options) const {
     llvm_unreachable("op does not have any tensor OpOperands / OpResults");
@@ -1174,9 +1172,8 @@ void mlir::scf::registerBufferizableOpInterfaceExternalModels(
     ExecuteRegionOp::attachInterface<ExecuteRegionOpInterface>(*ctx);
     ForOp::attachInterface<ForOpInterface>(*ctx);
     IfOp::attachInterface<IfOpInterface>(*ctx);
-    ForeachThreadOp::attachInterface<ForeachThreadOpInterface>(*ctx);
-    PerformConcurrentlyOp::attachInterface<PerformConcurrentlyOpInterface>(
-        *ctx);
+    ForallOp::attachInterface<ForallOpInterface>(*ctx);
+    InParallelOp::attachInterface<InParallelOpInterface>(*ctx);
     WhileOp::attachInterface<WhileOpInterface>(*ctx);
     YieldOp::attachInterface<YieldOpInterface>(*ctx);
   });

diff  --git a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
index 79a688ad9d4f7..aee10633c59f8 100644
--- a/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/LoopCanonicalization.cpp
@@ -177,13 +177,12 @@ struct AffineOpSCFCanonicalizationPattern : public OpRewritePattern<OpTy> {
         }
         return failure();
       }
-      if (scf::ForeachThreadOp foreachThreadOp =
-              scf::getForeachThreadOpThreadIndexOwner(iv)) {
-        for (int64_t idx = 0; idx < foreachThreadOp.getRank(); ++idx) {
-          if (foreachThreadOp.getInductionVar(idx) == iv) {
-            lb = foreachThreadOp.getMixedLowerBound()[idx];
-            ub = foreachThreadOp.getMixedUpperBound()[idx];
-            step = foreachThreadOp.getMixedStep()[idx];
+      if (scf::ForallOp forallOp = scf::getForallOpThreadIndexOwner(iv)) {
+        for (int64_t idx = 0; idx < forallOp.getRank(); ++idx) {
+          if (forallOp.getInductionVar(idx) == iv) {
+            lb = forallOp.getMixedLowerBound()[idx];
+            ub = forallOp.getMixedUpperBound()[idx];
+            step = forallOp.getMixedStep()[idx];
             return success();
           }
         }

diff  --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
index aa6e6a1dbe051..0e3a744004003 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-empty-tensor-elimination.mlir
@@ -151,7 +151,7 @@ func.func @parallel_insert_slice(
   %f0 = arith.constant 0.0: f32
   %c512 = arith.constant 512 : index
 
-  %r1 = scf.foreach_thread (%iv) in (%c512) shared_outs(%o = %t) -> (tensor<?xf32>) {
+  %r1 = scf.forall (%iv) in (%c512) shared_outs(%o = %t) -> (tensor<?xf32>) {
     // tensor.empty itself does not alloc but forwards to the insert_slice.
     // EmptyTensorOpElimination replaces the tensor.empty with an inplace
     // extract_slice.
@@ -162,7 +162,7 @@ func.func @parallel_insert_slice(
     %f = linalg.fill ins(%f0 : f32) outs(%a : tensor<?xf32>) -> tensor<?xf32>
 
     // Self-copy canonicalizes away later.
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %f into %o[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
     }
   }

diff  --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
index 27b342200e265..84b8a78c461a5 100644
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -21,7 +21,7 @@ func.func @map_nested_foreach_to_threads_excessive_threads(%x: memref<2 x 32 x f
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c900) {
+    scf.forall (%i, %j) in (%c7, %c900) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -33,7 +33,7 @@ func.func @map_nested_foreach_to_threads_excessive_threads(%x: memref<2 x 32 x f
   %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -62,7 +62,7 @@ func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>,
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c900) {
+    scf.forall (%i, %j) in (%c7, %c900) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -74,7 +74,7 @@ func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>,
   %name2 = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -89,7 +89,7 @@ func.func @map_nested_foreach_to_threads_fewer_threads(%x: memref<2 x 32 x f32>,
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{The requested GPU threads are fewer than the number of loop trip counts. Try to tile scf.foreach_thread before mapping or set small blockDim.}}
+  // expected-error @below {{The requested GPU threads are fewer than the number of loop trip counts. Try to tile scf.forall before mapping or set small blockDim.}}
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
 }
 
@@ -101,7 +101,7 @@ func.func @map_nested_foreach_to_threads_dynamic_trip_count(%x: memref<2 x 32 x
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c900) {
+    scf.forall (%i, %j) in (%c7, %c900) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -135,9 +135,9 @@ func.func @map_nested_foreach_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: t
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  %foreach, %tiled = transform.structured.tile_to_foreach_thread_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
+  %foreach, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{only bufferized scf.foreach_thread lowers to gpu.thread_id}}
+  // expected-error @below {{only bufferized scf.forall lowers to gpu.thread_id}}
   transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
 }
 
@@ -167,14 +167,14 @@ func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c900) {
+    scf.forall (%i, %j) in (%c7, %c900) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
         memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
      }  { mapping = [#gpu.thread<y>, #gpu.thread<x>] }
 
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
         %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
         %6 = math.fma %alpha, %4, %5 : f32
@@ -189,7 +189,7 @@ func.func @map_foreach_to_blocks_not_unique(%x: memref<2 x 32 x f32>, %y: memref
 transform.sequence failures(propagate) {
 ^bb0(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{could not find a unique topLevel scf.foreach_thread}}
+  // expected-error @below {{could not find a unique topLevel scf.forall}}
   %1 = transform.gpu.map_foreach_to_blocks %funcop
 }
 
@@ -202,14 +202,14 @@ func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref
   %c9 = arith.constant 9 : index
   %c7 = arith.constant 7 : index
 
-  scf.foreach_thread (%i, %j) in (%c7, %c65537) {
+  scf.forall (%i, %j) in (%c7, %c65537) {
       %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
       %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
       %6 = math.fma %alpha, %4, %5 : f32
       memref.store %6, %y[%i, %j] : memref<2 x 32 x f32>
   }  { mapping = [#gpu.thread<x>, #gpu.thread<y>] }
 
-  scf.foreach_thread (%i, %j) in (%c7, %c9) {
+  scf.forall (%i, %j) in (%c7, %c9) {
       %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
       %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
       %6 = math.fma %alpha, %4, %5 : f32
@@ -222,7 +222,7 @@ func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref
 transform.sequence failures(propagate) {
 ^bb0(%arg0: !pdl.operation):
   %funcop = transform.structured.match ops{["func.func"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{could not find a unique topLevel scf.foreach_thread}}
+  // expected-error @below {{could not find a unique topLevel scf.forall}}
   %1 = transform.gpu.map_foreach_to_blocks %funcop { generate_gpu_launch }
 }
 
@@ -231,7 +231,7 @@ transform.sequence failures(propagate) {
 func.func @map_foreach_to_blocks_large_loop(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
   %one = arith.constant 1 : index
   %c65535 = arith.constant 65535 : index
-  scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+  scf.forall (%i, %j) in (%c65535, %c65535) {
       %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
       %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
       %6 = math.fma %alpha, %4, %5 : f32
@@ -256,7 +256,7 @@ func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token)
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c32, %c32) {
+    scf.forall (%i, %j) in (%c32, %c32) {
         %4 = memref.load %x[%i, %j] : !type
         %5 = memref.load %y[%i, %j] : !type
         %6 = arith.mulf %4, %5 : f32
@@ -300,6 +300,6 @@ func.func @tiling_buffer_semantic_op(%x: memref<32x32xf32>, %y: memref<32x32xf32
 transform.sequence failures(propagate) {
 ^bb1(%arg0: !pdl.operation):
   %matmul = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!pdl.operation) -> !pdl.operation
-  // expected-error @below {{transform.structured.tile_to_foreach_thread_op failed to apply}}
-  %foreach, %tiled = transform.structured.tile_to_foreach_thread_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
+  // expected-error @below {{transform.structured.tile_to_forall_op failed to apply}}
+  %foreach, %tiled = transform.structured.tile_to_forall_op %matmul num_threads [10, 20, 30] (mapping = [ #gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z> ] )
 }

diff  --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir
index 035a770325a43..9ae28a5e64fc8 100644
--- a/mlir/test/Dialect/GPU/transform-gpu.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu.mlir
@@ -19,7 +19,7 @@ func.func @saxpy2dblock(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : !type
         %5 = memref.load %y[%i, %j] : !type
         %6 = math.fma %alpha, %4, %5 : f32
@@ -68,13 +68,13 @@ func.func @saxpy2d(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %stream : !g
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : !type
         %5 = memref.load %y[%i, %j] : !type
         %6 = math.fma %alpha, %4, %5 : f32
         memref.store %6, %y[%i, %j] : !type
      }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-     scf.foreach_thread (%i) in (%c12) {
+     scf.forall (%i) in (%c12) {
         %7 = memref.load %t[%i] : !type1d
         %8 = arith.addf %alpha, %7 : f32
         memref.store %8, %t[%i] : !type1d
@@ -112,8 +112,8 @@ func.func @saxpy4d(%x: !type4d, %y: !type4d, %alpha : f32) -> !type4d {
 //      CHECK:   %[[TIDY:.*]] = gpu.thread_id  y
 //      CHECK:   memref.load %[[ARGX]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
 //      CHECK:   memref.load %[[ARGY]][%[[BLKX]], %[[BLKY]], %[[TIDY]], %[[TIDX]]]
-  scf.foreach_thread (%i, %j) in (%c32, %c64) {
-    scf.foreach_thread (%k, %l) in (%c4, %c32) {
+  scf.forall (%i, %j) in (%c32, %c64) {
+    scf.forall (%k, %l) in (%c4, %c32) {
       %4 = memref.load %x[%i, %j, %k, %l] : !type4d
       %5 = memref.load %y[%i, %j, %k, %l] : !type4d
       %6 = math.fma %alpha, %4, %5 : f32
@@ -146,7 +146,7 @@ func.func @saxpy2d_no_barrier(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : !type
         %5 = memref.load %y[%i, %j] : !type
         %6 = math.fma %alpha, %4, %5 : f32
@@ -178,7 +178,7 @@ func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token)
 //      CHECK:   %[[TIDX:.*]] = gpu.thread_id  x
 //      CHECK:   memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]]
 //      CHECK:   memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]]
-    scf.foreach_thread (%i) in (%c32) {
+    scf.forall (%i) in (%c32) {
         %4 = memref.load %x[%i, %i] : !type
         %5 = memref.load %y[%i, %i] : !type
         %6 = arith.mulf %4, %5 : f32
@@ -211,7 +211,7 @@ func.func @saxpy3d_fold_id_z(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %s
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j, %k) in (%one, %c7, %c9) {
+    scf.forall (%i, %j, %k) in (%one, %c7, %c9) {
 //      CHECK:   memref.load %{{.*}}[%[[C0]],
 //      CHECK:   memref.load %{{.*}}[%[[C0]],
         %4 = memref.load %x[%i, %j, %k] : !type
@@ -248,13 +248,13 @@ func.func @map_multi_level(%x: !type, %y: !type, %t: !type1d, %alpha : f32, %str
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
             threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
   {
-    scf.foreach_thread (%i, %j) in (%c7, %c9) {
+    scf.forall (%i, %j) in (%c7, %c9) {
         %4 = memref.load %x[%i, %j] : !type
         %5 = memref.load %y[%i, %j] : !type
         %6 = math.fma %alpha, %4, %5 : f32
         memref.store %6, %y[%i, %j] : !type
      }  { mapping = [#gpu.thread<y>, #gpu.thread<x>]}
-     scf.foreach_thread (%i) in (%c12) {
+     scf.forall (%i) in (%c12) {
         %7 = memref.load %t[%i] : !type1d
         %8 = arith.addf %alpha, %7 : f32
         memref.store %8, %t[%i] : !type1d

diff  --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index cfce3b3eea70e..6faa23b7f0ce7 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -880,10 +880,10 @@ func.func @reduce_dispatch_0() -> tensor<4x2xf32> {
   %c4 = arith.constant 4 : index
   %cst = arith.constant 0.000000e+00 : f32
   %0 = tensor.empty() : tensor<4x2xf32>
-  %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) {
+  %res = scf.forall (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) {
     %1 = tensor.empty() : tensor<1x1xf32>
     %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       //      CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}}
       // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor<f32> into tensor<4x2xf32>
       tensor.parallel_insert_slice %2 into %o[%arg0, %arg1] [1, 1] [1, 1] :

diff  --git a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
index 7b04d6c5e152b..731e8d0f53886 100644
--- a/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
+++ b/mlir/test/Dialect/Linalg/tile-to-foreach-thread.mlir
@@ -13,14 +13,14 @@ module {
 //  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor<?x?xf32>
 //  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor<?x?xf32>
   func.func @matmul(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  //      CHECK: scf.foreach_thread ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor<?x?xf32>) {
+  //      CHECK: scf.forall ({{.*}}) in (10, 20) shared_outs(%[[C_BLK:.*]] = %[[C]]) -> (tensor<?x?xf32>) {
   //      CHECK:   %[[tA:.*]] = tensor.extract_slice %[[A]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
   //      CHECK:   %[[tB:.*]] = tensor.extract_slice %[[B]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
   //      CHECK:   %[[tC:.*]] = tensor.extract_slice %[[C_BLK]]{{.*}} : tensor<?x?xf32> to tensor<?x?xf32>
   //      CHECK:   %[[RES:.*]] = linalg.matmul
   // CHECK-SAME:      ins(%[[tA]], %[[tB]] : tensor<?x?xf32>, tensor<?x?xf32>)
   // CHECK-SAME:     outs(%[[tC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-  //      CHECK:   scf.foreach_thread.perform_concurrently {
+  //      CHECK:   scf.forall.in_parallel {
   // CHECK-NEXT:     tensor.parallel_insert_slice %[[RES]] into %[[C_BLK]]{{.*}} :
   // CHECK-SAME:       tensor<?x?xf32> into tensor<?x?xf32>
   // CHECK-NEXT:   }
@@ -33,7 +33,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 20] (mapping = [ #gpu.thread<y>, #gpu.thread<x> ] )
   }
 }
 
@@ -58,12 +58,12 @@ func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?x
   //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
   //  CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size_1]]]
   //  CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map0]]()[%[[N]], %[[tile_size_2]]]
-  //      CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
+  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
   //      CHECK:   tensor.extract_slice %[[A]]
   //      CHECK:   tensor.extract_slice %[[B]]
   //      CHECK:   tensor.extract_slice %[[C_BLK]]
   //      CHECK:   linalg.matmul
-  //      CHECK:   scf.foreach_thread.perform_concurrently
+  //      CHECK:   scf.forall.in_parallel
   // CHECK-NEXT:    tensor.parallel_insert_slice
   %tile_size_1 = "test.dummy"() : () -> (index)
   %tile_size_2 = "test.dummy"() : () -> (index)
@@ -76,7 +76,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes %sz
+  %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes %sz
 }
 
 // -----
@@ -93,7 +93,7 @@ transform.sequence failures(propagate) {
 //  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor
 //  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor
 func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
-  //      CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]])
+  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 21) shared_outs(%[[C_BLK:.*]] = %[[C]])
   //      CHECK:   %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV1]])
   //      CHECK:   %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
   //  CHECK-NOT:   affine.min
@@ -104,7 +104,7 @@ func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: t
   //      CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
   //      CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
   //      CHECK:   linalg.matmul
-  //      CHECK:   scf.foreach_thread.perform_concurrently
+  //      CHECK:   scf.forall.in_parallel
   // CHECK-NEXT:    tensor.parallel_insert_slice
   %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                     outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
@@ -114,7 +114,7 @@ func.func @matmul_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: t
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [10, 21]
+  %1:2 = transform.structured.tile_to_forall_op %0 num_threads [10, 21]
 }
 
 
@@ -136,7 +136,7 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C
   //      CHECK: %[[N:.+]] = tensor.dim %[[B]], %c1 :
   //      CHECK: %[[NT0:.+]] = affine.apply #map()[%[[M]]]
   //      CHECK: %[[NT1:.+]] = affine.apply #map1()[%[[N]]]
-  //      CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
+  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
   //      CHECK:   %[[TS0:.+]] = affine.min #[[$map2]](%[[IV0]])[%[[M]]]
   //      CHECK:   %[[TS1:.+]] = affine.min #[[$map4]](%[[IV1]])[%[[N]]]
   //      CHECK:   %[[LB0:.+]] = affine.apply #[[$map5]](%[[IV0]])
@@ -145,7 +145,7 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C
   //      CHECK:   tensor.extract_slice %[[B]]
   //      CHECK:   tensor.extract_slice %[[C_BLK]]
   //      CHECK:   linalg.matmul
-  //      CHECK:   scf.foreach_thread.perform_concurrently
+  //      CHECK:   scf.forall.in_parallel
   // CHECK-NEXT:    tensor.parallel_insert_slice
   %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
                     outs(%C : tensor<?x?xf32>) -> (tensor<?x?xf32>)
@@ -155,7 +155,7 @@ func.func @matmul_tile_size_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?xf32>, %C
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [10, 20]
+  %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [10, 20]
 }
 
 // -----
@@ -171,7 +171,7 @@ transform.sequence failures(propagate) {
 //  CHECK-SAME:   %[[B:[0-9a-z]+]]: tensor
 //  CHECK-SAME:   %[[C:[0-9a-z]+]]: tensor
 func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf32>, %C: tensor<100x300xf32>) -> tensor<100x300xf32> {
-  //      CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
+  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (10, 15) shared_outs(%[[C_BLK:.*]] = %[[C]])
   //      CHECK:   %[[TS:.+]] = affine.min #[[$map0]](%[[IV1]])
   //  CHECK-NOT:   affine.max
   //  CHECK-NOT:   affine.min
@@ -181,7 +181,7 @@ func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf
   //      CHECK:   %[[tB:.+]] = tensor.extract_slice %[[B]][0, %[[LB1]]] [200, %[[TS]]] [1, 1] :
   //      CHECK:   %[[tC:.+]] = tensor.extract_slice %[[C_BLK]][%[[LB0]], %[[LB1]]] [10, %[[TS]]] [1, 1] :
   //      CHECK:   linalg.matmul
-  //      CHECK:   scf.foreach_thread.perform_concurrently
+  //      CHECK:   scf.forall.in_parallel
   // CHECK-NEXT:    tensor.parallel_insert_slice
   %0 = linalg.matmul ins(%A, %B : tensor<100x200xf32>, tensor<200x300xf32>)
                     outs(%C : tensor<100x300xf32>) -> (tensor<100x300xf32>)
@@ -191,7 +191,7 @@ func.func @matmul_tile_size_static(%A: tensor<100x200xf32>, %B: tensor<200x300xf
 transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [10, 21]
+  %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [10, 21]
 }
 
 // -----
@@ -213,15 +213,15 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [2] ( mapping = [#gpu.thread<x>])
+    %1:2 = transform.structured.tile_to_forall_op %0 num_threads [2] ( mapping = [#gpu.thread<x>])
   }
 }
 // CHECK-DAG: #[[$map0:.+]] = affine_map<(d0) -> (d0 * 2)>
 
 // CHECK-LABEL: extract_source(
-//       CHECK:  scf.foreach_thread (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) {
+//       CHECK:  scf.forall (%[[ARG:.*]]) in (2) shared_outs(%{{.*}} = %{{.*}}) -> (tensor<4xf32>) {
 //       CHECK:    %[[OFF:.*]] = affine.apply #[[$map0]](%[[ARG]])
-//       CHECK:    scf.foreach_thread.perform_concurrently {
+//       CHECK:    scf.forall.in_parallel {
 //       CHECK:      tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%[[OFF]]] [2] [1] : tensor<2xf32> into tensor<4xf32>
 
 // -----
@@ -247,12 +247,12 @@ func.func @matmul_tile_size_dynamic_dynamic(%A: tensor<?x?xf32>, %B: tensor<?x?x
   //  CHECK-DAG: %[[N:.+]] = tensor.dim %[[B]], %c1 :
   //  CHECK-DAG: %[[NT0:.+]] = affine.apply #[[$map0]]()[%[[M]], %[[tile_size]]]
   //  CHECK-DAG: %[[NT1:.+]] = affine.apply #[[$map1]]()[%[[N]]]
-  //      CHECK: scf.foreach_thread (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
+  //      CHECK: scf.forall (%[[IV0:.+]], %[[IV1:.+]]) in (%[[NT0]], %[[NT1]]) shared_outs(%[[C_BLK:.*]] = %[[C]])
   //      CHECK:   tensor.extract_slice %[[A]]
   //      CHECK:   tensor.extract_slice %[[B]]
   //      CHECK:   tensor.extract_slice %[[C_BLK]]
   //      CHECK:   linalg.matmul
-  //      CHECK:   scf.foreach_thread.perform_concurrently
+  //      CHECK:   scf.forall.in_parallel
   // CHECK-NEXT:    tensor.parallel_insert_slice
   %tile_size = "test.dummy"() : () -> (index)
   %0 = linalg.matmul ins(%A, %B : tensor<?x?xf32>, tensor<?x?xf32>)
@@ -264,7 +264,7 @@ transform.sequence failures(propagate) {
 ^bb1(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %sz = transform.structured.match ops{["test.dummy"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1:2 = transform.structured.tile_to_foreach_thread_op %0 tile_sizes [%sz, 20]
+  %1:2 = transform.structured.tile_to_forall_op %0 tile_sizes [%sz, 20]
 }
 
 // -----
@@ -282,7 +282,7 @@ transform.sequence failures(propagate) {
   func.func @tile_output_multi_1d_static(%IN1: tensor<100xf32>, %IN2: tensor<100xf32>,
                                          %OUT1: tensor<100xf32>, %OUT2: tensor<100xf32>)
                                          -> (tensor<100xf32>, tensor<100xf32>) {
-//      CHECK: scf.foreach_thread (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
+//      CHECK: scf.forall (%[[IV0:.+]]) in (7) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
 //      CHECK:   %[[TSMIN:.+]] = affine.min #[[$map0]](%[[IV0]])
 //      CHECK:   %[[TS:.+]] = affine.max #[[$map1]](%[[TSMIN]])
 //  CHECK-NOT:   affine.min
@@ -293,7 +293,7 @@ transform.sequence failures(propagate) {
 //      CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
 //      CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
 //      CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
-//      CHECK:   scf.foreach_thread.perform_concurrently
+//      CHECK:   scf.forall.in_parallel
 // CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]]] [%[[TS]]] [1] :
 // CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [%[[TS]]] [1] :
     %res1, %res2 = linalg.generic
@@ -317,7 +317,7 @@ transform.sequence failures(propagate) {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %foreach_thread, %tiled_generic = transform.structured.tile_to_foreach_thread_op %0 num_threads [7]
+    %forall, %tiled_generic = transform.structured.tile_to_forall_op %0 num_threads [7]
   }
 
 // -----
@@ -337,14 +337,14 @@ transform.sequence failures(propagate) {
   func.func @tile_output_multi_1d2d_static(%IN1: tensor<100xf32>, %IN2: tensor<100x300xf32>, %IN3: tensor<300xf32>,
                      %OUT1: tensor<300x100xf32>, %OUT2: tensor<300xf32>)
                      -> (tensor<300x100xf32>, tensor<300xf32>) {
-//      CHECK: scf.foreach_thread (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
+//      CHECK: scf.forall (%[[IV0:.+]]) in (4) shared_outs(%[[OUT1:[0-9a-z]+]] = %[[ORGOUT1]], %[[OUT2:[0-9a-z]+]] = %[[ORGOUT2]])
 //      CHECK:   %[[LB:.+]] = affine.apply #[[$map0]](%[[IV0]])
 //      CHECK:   %[[tIN1:.+]] = tensor.extract_slice %[[IN2]][0, %[[LB]]] [100, 75]
 //      CHECK:   %[[tIN2:.+]] = tensor.extract_slice %[[IN3]][%[[LB]]] [75]
 //      CHECK:   %[[tOUT1:.+]] = tensor.extract_slice %[[OUT1]][%[[LB]], 0] [75, 100]
 //      CHECK:   %[[tOUT2:.+]] = tensor.extract_slice %[[OUT2]][%[[LB]]] [75]
 //      CHECK:   %[[RES1:[0-9]+]]:[[RES2:[0-9]+]] = linalg.generic
-//      CHECK:   scf.foreach_thread.perform_concurrently
+//      CHECK:   scf.forall.in_parallel
 // CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#0 into %[[OUT1]][%[[LB]], 0] [75, 100]
 // CHECK-NEXT:    tensor.parallel_insert_slice %[[RES1]]#1 into %[[OUT2]][%[[LB]]] [75]
     %res2, %res3 = linalg.generic {
@@ -370,6 +370,6 @@ transform.sequence failures(propagate) {
   transform.sequence failures(propagate) {
   ^bb1(%IN_MAT2: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.generic"]} in %IN_MAT2 : (!pdl.operation) -> !pdl.operation
-    %foreach_thread, %tiled_generic = transform.structured.tile_to_foreach_thread_op %0 num_threads [4]
+    %forall, %tiled_generic = transform.structured.tile_to_forall_op %0 num_threads [4]
   }
 

diff  --git a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
index 67ac3b432f8ce..d6b3ff3181b29 100644
--- a/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse-into-containing.mlir
@@ -16,8 +16,8 @@ module {
     %d0 = tensor.dim %arg1, %c0 : tensor<?xf32>
     %1 = affine.apply #map0()[%d0, %arg0]
 
-    // CHECK: scf.foreach_thread {{.*}} {
-    %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
+    // CHECK: scf.forall {{.*}} {
+    %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
       %3 = affine.apply #map1(%arg3)[%arg0]
       %4 = affine.min #map2(%arg3)[%d0, %arg0]
       %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
@@ -28,7 +28,7 @@ module {
 
       // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]
       %7 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
       }
     }
@@ -44,7 +44,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // linalg.fill is tileable. The op is tiled and fused.
     transform.structured.fuse_into_containing_op %0 into %1
@@ -66,8 +66,8 @@ module {
     %0 = tensor.empty(%arg0) : tensor<?xf32>
     %1 = affine.apply #map0()[%arg0]
 
-    // CHECK: scf.foreach_thread {{.*}} {
-    %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) {
+    // CHECK: scf.forall {{.*}} {
+    %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %arg2) -> (tensor<64xf32>) {
       // CHECK: %[[INIT_TENSOR:.*]] = tensor.empty
       %3 = affine.apply #map1(%arg3)[%arg0]
       %4 = affine.min #map2(%arg3)[%arg0]
@@ -75,7 +75,7 @@ module {
 
       // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[INIT_TENSOR]]
       %7 = linalg.elemwise_unary ins(%0 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor<?xf32> into tensor<64xf32>
       }
     }
@@ -87,7 +87,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["tensor.empty"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // tensor.empty is not tileable. The op is cloned and fused.
     transform.structured.fuse_into_containing_op %0 into %1
@@ -111,8 +111,8 @@ module {
     %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<?xf32>) -> tensor<?xf32>
     %d0 = tensor.dim %arg1, %c0 : tensor<?xf32>
 
-    // CHECK: scf.foreach_thread {{.*}} -> (tensor<?xf32>) {
-    %2 = scf.foreach_thread (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor<?xf32>) {
+    // CHECK: scf.forall {{.*}} -> (tensor<?xf32>) {
+    %2 = scf.forall (%arg3) in (%d0) shared_outs(%o = %0) -> (tensor<?xf32>) {
       %5 = tensor.extract_slice %o[%arg3] [1] [1] : tensor<?xf32> to tensor<f32>
       
       // CHECK: tensor.extract_slice %{{.*}}[%{{.*}}] [1] [1] : tensor<?xf32> to tensor<1xf32>
@@ -121,7 +121,7 @@ module {
       // CHECK: func.call @foo(%{{.*}}) : (tensor<f32>) -> tensor<f32>
       %7 = func.call @foo(%5) : (tensor<f32>) -> tensor<f32>
 
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
       // CHECK: tensor.parallel_insert_slice %{{.*}} into %{{.*}}[%{{.*}}] [1] [1] : tensor<f32> into tensor<?xf32>
         tensor.parallel_insert_slice %7 into %o[%arg3] [1] [1] : tensor<f32> into tensor<?xf32>
       }
@@ -133,7 +133,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // linalg.fill is tileable. The op is tiled and fused.
     transform.structured.fuse_into_containing_op %0 into %1
@@ -158,8 +158,8 @@ module {
     %d0 = tensor.dim %arg1, %c0 : tensor<?xf32>
     %1 = affine.apply #map0()[%d0, %arg0]
 
-    // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor<?xf32>) {
-    %2 = scf.foreach_thread (%arg3) in (%1) shared_outs(%o = %0) -> (tensor<?xf32>) {
+    // CHECK: scf.forall {{.*}} shared_outs(%[[BBARGOUT:.*]] = %[[OUT]]) -> (tensor<?xf32>) {
+    %2 = scf.forall (%arg3) in (%1) shared_outs(%o = %0) -> (tensor<?xf32>) {
       %3 = affine.apply #map1(%arg3)[%arg0]
       %4 = affine.min #map2(%arg3)[%d0, %arg0]
       %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
@@ -170,7 +170,7 @@ module {
 
       // CHECK: %[[T2:.*]] = linalg.elemwise_unary {{.*}} outs(%[[T1]]
       %7 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
       }
     }
@@ -181,7 +181,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.fill"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // linalg.fill is tileable. The op is tiled and fused.
     transform.structured.fuse_into_containing_op %0 into %1
@@ -218,8 +218,8 @@ module {
 
     %1 = affine.apply #map0()[%d0, %idx]
 
-    // CHECK: scf.foreach_thread {{.*}} {
-    %2 = scf.foreach_thread (%i) in (%1) shared_outs(%o = %out_2) -> (tensor<?xf32>) {
+    // CHECK: scf.forall {{.*}} {
+    %2 = scf.forall (%i) in (%1) shared_outs(%o = %out_2) -> (tensor<?xf32>) {
       %3 = affine.apply #map1(%i)[%idx]
       %4 = affine.min #map2(%i)[%d0, %idx]
       %5 = tensor.extract_slice %o[%3] [%4] [1] : tensor<?xf32> to tensor<?xf32>
@@ -230,7 +230,7 @@ module {
 
       // CHECK: %[[T2:.*]] = linalg.elemwise_unary ins(%[[T1]]#0
       %7 = linalg.elemwise_unary ins(%6 : tensor<?xf32>) outs(%5 : tensor<?xf32>) -> tensor<?xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %7 into %o[%3] [%4] [1] : tensor<?xf32> into tensor<?xf32>
       }
     }
@@ -241,7 +241,7 @@ module {
   transform.sequence failures(propagate) {
   ^bb1(%arg1: !pdl.operation):
     %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-    %1 = transform.structured.match ops{["scf.foreach_thread"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+    %1 = transform.structured.match ops{["scf.forall"]} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // linalg.generic is tileable. The op is tiled and fused.
     transform.structured.fuse_into_containing_op %0 into %1

diff  --git a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
index 140ffe437a20d..032b480f50419 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-and-fuse.mlir
@@ -4,7 +4,7 @@
 
 module {
   // CHECK: func @foo
-  // CHECK:   scf.foreach_thread {{.*}} {
+  // CHECK:   scf.forall {{.*}} {
   // CHECK:     linalg.fill
   // CHECK:     linalg.matmul
   // CHECK:     linalg.generic
@@ -47,21 +47,21 @@ module {
     %producers = transform.structured.match attributes{"__producer__"} in %arg1 : (!pdl.operation) -> !pdl.operation
 
     // Tile the root.
-    %foreach_thread_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %root num_threads [10, 20]
+    %forall_op, %tiled_op = transform.structured.tile_to_forall_op %root num_threads [10, 20]
 
     // Fuse all producers.
-    transform.structured.fuse_into_containing_op %producers into %foreach_thread_op
+    transform.structured.fuse_into_containing_op %producers into %forall_op
   }
 }
 
 // -----
 
-// Inverse the order of the payload ops passed to the tile_to_foreach_thread_op
+// Inverse the order of the payload ops passed to the tile_to_forall_op
 // op. Fusion should still work.
 
 module {
   // CHECK: func @foo
-  // CHECK:   scf.foreach_thread {{.*}} {
+  // CHECK:   scf.forall {{.*}} {
   // CHECK:     linalg.fill
   // CHECK:     linalg.matmul
   // CHECK:     linalg.generic
@@ -105,9 +105,9 @@ module {
     %reversed_producers = transform.test_reverse_payload_ops %producers
 
     // Tile the root.
-    %foreach_thread_op, %tiled_op = transform.structured.tile_to_foreach_thread_op %root num_threads [10, 20]
+    %forall_op, %tiled_op = transform.structured.tile_to_forall_op %root num_threads [10, 20]
 
     // Fuse all producers.
-    transform.structured.fuse_into_containing_op %reversed_producers into %foreach_thread_op
+    transform.structured.fuse_into_containing_op %reversed_producers into %forall_op
   }
 }

diff  --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
index 0c4d9f054ae39..16578bdb305ab 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -109,7 +109,7 @@ func.func @reduction_tile_parallel(
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
     by num_threads = [0, 5], tile_sizes = []
 }
 
@@ -127,7 +127,7 @@ transform.sequence failures(propagate) {
 // CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
 //     CHECK:   %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
-//     CHECK:   %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
+//     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
 // CHECK-DAG:     %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
 // CHECK-DAG:     %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
 // CHECK-DAG:     %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
@@ -139,7 +139,7 @@ transform.sequence failures(propagate) {
 //     CHECK:       arith.addf
 //     CHECK:       linalg.yield
 //     CHECK:     } -> tensor<?xf32>
-//     CHECK:     scf.foreach_thread.perform_concurrently {
+//     CHECK:     scf.forall.in_parallel {
 //     CHECK:       tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?xf32> into tensor<?x5xf32>
 //     CHECK:     }
 //     CHECK:   }
@@ -161,7 +161,7 @@ func.func @matmul_tile_parallel(
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
     by num_threads = [0, 0, 5], tile_sizes = []
 }
 
@@ -181,7 +181,7 @@ transform.sequence failures(propagate) {
 // CHECK-DAG:   %[[D4:.*]] = tensor.dim %[[ARG2]], %[[C1]] : tensor<?x?xf32>
 //     CHECK:   %[[E:.*]] = tensor.empty(%[[D3]], %[[D4]]) : tensor<?x?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x?x5xf32>) -> tensor<?x?x5xf32>
-//     CHECK:   %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x?x5xf32>) {
+//     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x?x5xf32>) {
 // CHECK-DAG:     %[[TS0:.+]] = affine.min #[[MAP0]](%[[IV]])[%[[D1]]]
 // CHECK-DAG:     %[[TS1:.+]] = affine.max #[[MAP1]](%[[TS0]])
 // CHECK-DAG:     %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor<?x?x5xf32> to tensor<?x?xf32>
@@ -190,7 +190,7 @@ transform.sequence failures(propagate) {
 //     CHECK:     %[[INCHUNKB:.+]] = tensor.extract_slice %[[ARG1]][%[[TINDEX]], 0] [%[[TS1]], %[[D2]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 //     CHECK:     %[[TEMPEXT:.+]] = tensor.extract_slice %[[ET]][0, 0] [%[[D0]], %[[D2]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
 //     CHECK:     %[[PARTIAL:.+]] = linalg.matmul ins(%[[INCHUNKA]], %[[INCHUNKB]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[TEMPEXT]] : tensor<?x?xf32>) -> tensor<?x?xf32>
-//     CHECK:     scf.foreach_thread.perform_concurrently {
+//     CHECK:     scf.forall.in_parallel {
 //     CHECK:       tensor.parallel_insert_slice %[[PARTIAL]] into %[[ARG3]][0, 0, %[[IV]]] [%[[D0]], %[[D2]], 1] [1, 1, 1] : tensor<?x?xf32> into tensor<?x?x5xf32>
 //     CHECK:     }
 //     CHECK:   }
@@ -220,7 +220,7 @@ func.func @reduction_tile_parallel_cyclic_dist(
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 
+  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 
     by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>]
 }
 
@@ -238,7 +238,7 @@ transform.sequence failures(propagate) {
 // CHECK-DAG:   %[[D2:.*]] = tensor.dim %[[ARG1]], %[[C0]] : tensor<?xf32>
 //     CHECK:   %[[E:.*]] = tensor.empty(%[[D2]]) : tensor<?x5xf32>
 //     CHECK:   %[[F:.*]] = linalg.fill ins(%[[I]] : f32) outs(%[[E]] : tensor<?x5xf32>) -> tensor<?x5xf32>
-//     CHECK:   %[[L:.*]] = scf.foreach_thread (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
+//     CHECK:   %[[L:.*]] = scf.forall (%[[IV:.+]]) in (5) shared_outs(%[[ARG3:.+]] = %[[F]]) -> (tensor<?x5xf32>) {
 //     CHECK:     %[[ET:.+]] = tensor.extract_slice %[[ARG3:.+]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?x5xf32> to tensor<?xf32>
 //     CHECK:     %[[D1:.*]] = tensor.dim %[[ARG0]], %[[C1]] : tensor<?x?xf32>
 //     CHECK:     %[[LB:.+]] = affine.apply #[[MAP0]]()[%[[IV]]]
@@ -255,7 +255,7 @@ transform.sequence failures(propagate) {
 //     CHECK:       %[[INS:.+]] = tensor.insert_slice %[[PARTIAL]] into %[[ACC]][0] [%[[D3]]] [1] : tensor<?xf32> into tensor<?xf32>
 //     CHECK:       scf.yield %[[INS]] : tensor<?xf32>
 //     CHECK:     }
-//     CHECK:     scf.foreach_thread.perform_concurrently {
+//     CHECK:     scf.forall.in_parallel {
 //     CHECK:       tensor.parallel_insert_slice %[[CARRY]] into %[[ARG3]][0, %[[IV]]] [%[[D0]], 1] [1, 1] : tensor<?xf32> into tensor<?x5xf32>
 //     CHECK:     }
 //     CHECK:   }
@@ -285,7 +285,7 @@ func.func @reduction_tile_parallel_cyclic_dist(
 transform.sequence failures(propagate) {
 ^bb0(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0 
+  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0 
     by num_threads = [0, 5], tile_sizes = [0, 3], mapping = [#gpu.thread<x>]
   
   //      CHECK:     expecting fill
@@ -303,7 +303,7 @@ transform.sequence failures(propagate) {
 
 // -----
 
-func.func @reduction_untiled_foreach_thread(
+func.func @reduction_untiled_forall(
   %arg0: tensor<?x?xf32>, %out: tensor<?xf32>) -> tensor<?xf32> {
   // expected-note @below {{target operation}}
   %red = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
@@ -323,7 +323,7 @@ transform.sequence failures(propagate) {
 ^bb0(%arg1: !pdl.operation):
   %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   // expected-error @below {{could not tile reduction}}
-  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_foreach_thread %0
+  %loop, %1, %2, %3 = transform.structured.tile_reduction_using_forall %0
     by num_threads = [5], tile_sizes = [3], mapping = [#gpu.thread<x>]
 
 }

diff  --git a/mlir/test/Dialect/SCF/canonicalize.mlir b/mlir/test/Dialect/SCF/canonicalize.mlir
index 7ee88b64ea8ac..4836fae4f757e 100644
--- a/mlir/test/Dialect/SCF/canonicalize.mlir
+++ b/mlir/test/Dialect/SCF/canonicalize.mlir
@@ -1486,8 +1486,8 @@ func.func @canonicalize_parallel_insert_slice_indices(
   // CHECK: %[[c1:.*]] = arith.constant 1 : index
   %c1 = arith.constant 1 : index
 
-  %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
-    scf.foreach_thread.perform_concurrently {
+  %2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %arg0 into %o[%tidx, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<?x?xf32>
     }
   }

diff  --git a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir
index 2632b721cee2d..5b65c49ea6ed1 100644
--- a/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir
+++ b/mlir/test/Dialect/SCF/foreach-thread-canonicalization.mlir
@@ -8,7 +8,7 @@ func.func @reduce() {
   linalg.fill ins(%cst_0 : f32) outs(%0 : memref<128x384xf32>)
   %2 = memref.alloc() : memref<128xf32>
   linalg.fill ins(%cst_0 : f32) outs(%2 : memref<128xf32>)
-  scf.foreach_thread (%arg0) in (%c2) {
+  scf.forall (%arg0) in (%c2) {
     %7 = affine.min affine_map<(d0) -> (d0 * -64 + 128, 64)>(%arg0)
     %8 = affine.max affine_map<(d0) -> (0, d0)>(%7)
     %9 = affine.apply affine_map<(d0) -> (d0 * 64)>(%arg0)

diff  --git a/mlir/test/Dialect/SCF/invalid.mlir b/mlir/test/Dialect/SCF/invalid.mlir
index c1c66393a5ac8..c3c396ec808f2 100644
--- a/mlir/test/Dialect/SCF/invalid.mlir
+++ b/mlir/test/Dialect/SCF/invalid.mlir
@@ -548,9 +548,9 @@ func.func @wrong_num_results(%in: tensor<100xf32>, %out: tensor<100xf32>) {
   %num_threads = arith.constant 100 : index
 
   // expected-error @+1 {{1 operands present, but expected 2}}
-  %result:2 = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) {
+  %result:2 = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>, tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
       }
@@ -564,9 +564,9 @@ func.func @invalid_insert_dest(%in: tensor<100xf32>, %out: tensor<100xf32>) {
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         // expected-error @+1 {{may only insert into an output block argument}}
         tensor.parallel_insert_slice %1 into %out[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
@@ -581,10 +581,10 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> (tensor<100xf32>) {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
       // expected-error @+1 {{expected only tensor.parallel_insert_slice ops}}
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
         %0 = arith.constant 1: index
@@ -598,8 +598,8 @@ func.func @wrong_terminator_op(%in: tensor<100xf32>, %out: tensor<100xf32>) {
 func.func @mismatched_mapping(%x: memref<2 x 32 x f32>, %y: memref<2 x 32 x f32>, %t: memref<32 x f32>, %alpha : f32, %stream : !gpu.async.token) -> memref<2 x 32 x f32> {
   %one = arith.constant 1 : index
   %c65535 = arith.constant 65535 : index
-  // expected-error @below {{'scf.foreach_thread' op mapping attribute size must match op rank}}
-  scf.foreach_thread (%i, %j) in (%c65535, %c65535) {
+  // expected-error @below {{'scf.forall' op mapping attribute size must match op rank}}
+  scf.forall (%i, %j) in (%c65535, %c65535) {
       %4 = memref.load %x[%i, %j] : memref<2 x 32 x f32>
       %5 = memref.load %y[%i, %j] : memref<2 x 32 x f32>
       %6 = math.fma %alpha, %4, %5 : f32

diff  --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
index b764b41877250..dee38e0936d25 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -614,7 +614,7 @@ func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
   %c0 = arith.constant 0 : index
   %cst = arith.constant -0.000000e+00 : f32
   %c320 = arith.constant 320 : index
-  %4 = scf.foreach_thread (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
+  %4 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %2) -> (tensor<320xf32>) {
     // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
     %5 = tensor.extract_slice %3[%arg0, 0] [1, 10240] [1, 1]  : tensor<320x10240xf32> to tensor<1x10240xf32>
     // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
@@ -624,7 +624,7 @@ func.func @same_enclosing_repetitive_region(%2: tensor<320xf32>,
     // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
     %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<1xf32>) -> tensor<1xf32>
 
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
       tensor.parallel_insert_slice %8 into %arg1[%arg0] [1] [1] : tensor<1xf32> into tensor<320xf32>
     }

diff  --git a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
index d231a5cc46e3e..e7001e5d4abe0 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-tensor-copy-insertion.mlir
@@ -108,23 +108,23 @@ func.func @scf_while_non_equiv_condition_and_body(%A: tensor<5xi1>,
 
 // -----
 
-// CHECK-LABEL: func @scf_foreach_thread_out_of_place(
+// CHECK-LABEL: func @scf_forall_out_of_place(
 //  CHECK-SAME:     %[[arg0:.*]]: tensor<100xf32>, %[[arg1:.*]]: tensor<100xf32>
-// CHECK-FUNC-LABEL: func @scf_foreach_thread_out_of_place(
-func.func @scf_foreach_thread_out_of_place(%in: tensor<100xf32>,
+// CHECK-FUNC-LABEL: func @scf_forall_out_of_place(
+func.func @scf_forall_out_of_place(%in: tensor<100xf32>,
                                            %out: tensor<100xf32>) {
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
   // CHECK-FUNC-NOT: alloc_tensor
   // CHECK: %[[alloc:.*]] = bufferization.alloc_tensor() copy(%[[arg1]]) {bufferization.escape = [false]} : tensor<100xf32>
-  // CHECK: scf.foreach_thread {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  // CHECK: scf.forall {{.*}} shared_outs(%[[o:.*]] = %[[alloc]])
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
       // CHECK: tensor.extract_slice
-      // CHECK: scf.foreach_thread.perform_concurrently
+      // CHECK: scf.forall.in_parallel
       // CHECK: tensor.parallel_insert_slice %{{.*}} into %[[o]]
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
       }

diff  --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
index e37fe73d01170..7726cadd32cc3 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -543,8 +543,8 @@ func.func @parallel_insert_slice_no_conflict(
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
 
-  // CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-  %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
+  // CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]])
+  %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
       // CHECK: %[[subview:.*]] = memref.subview %[[arg2]][5] [%[[idx]]] [1]
       %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor<?xf32> to tensor<?xf32>
       // CHECK: linalg.fill ins(%{{.*}}) outs(%[[subview]] : memref<?xf32
@@ -553,9 +553,9 @@ func.func @parallel_insert_slice_no_conflict(
       // CHECK: memref.copy %[[subview]], %[[subview]]
 
       // Empty terminator is elided from pretty-printing.
-      // CHECK-NOT: scf.foreach_thread.perform_concurrently
+      // CHECK-NOT: scf.forall.in_parallel
       // CHECK-NOT: parallel_insert_slice
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
           tensor<?xf32> into tensor<?xf32>
       }
@@ -589,8 +589,8 @@ func.func @parallel_insert_slice_with_conflict(
   // CHECK: %[[alloc1:.*]] = memref.alloc
   // CHECK: memref.copy %[[arg2]], %[[alloc1]]
 
-  // CHECK: scf.foreach_thread (%[[tidx:.*]]) in (%[[idx2]])
-  %2 = scf.foreach_thread (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
+  // CHECK: scf.forall (%[[tidx:.*]]) in (%[[idx2]])
+  %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32>) {
       // CHECK: %[[subview1:.*]] = memref.subview %[[alloc1]][5] [%[[idx]]] [1]
       %6 = tensor.extract_slice %o[5] [%idx] [%c1] : tensor<?xf32> to tensor<?xf32>
 
@@ -601,9 +601,9 @@ func.func @parallel_insert_slice_with_conflict(
       // CHECK: memref.copy %[[subview1]], %[[subview1]]
 
       // Empty terminator is elided from pretty-printing.
-      // CHECK-NOT: scf.foreach_thread.perform_concurrently
+      // CHECK-NOT: scf.forall.in_parallel
       // CHECK-NOT: parallel_insert_slice
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
           tensor<?xf32> into tensor<?xf32>
       }
@@ -629,8 +629,8 @@ func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<
   %c2 = arith.constant 2 : index
   %c4 = arith.constant 4 : index
 
-  // CHECK: scf.foreach_thread {{.*}}
-  %0 = scf.foreach_thread (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
+  // CHECK: scf.forall {{.*}}
+  %0 = scf.forall (%arg3, %arg4) in (%c2, %c4) shared_outs(%o = %arg2) -> (tensor<8x8xf32>) {
     %1 = affine.apply #map0(%arg3)
     %3 = tensor.extract_slice %arg0[%1, 0] [4, 8] [1, 1] : tensor<8x8xf32> to tensor<4x8xf32>
     %4 = affine.apply #map1(%arg4)
@@ -639,7 +639,7 @@ func.func @matmul(%arg0: tensor<8x8xf32>, %arg1: tensor<8x8xf32>, %arg2: tensor<
 
     //      CHECK: linalg.matmul ins({{.*}}memref<4x8xf32, strided<[?, ?], offset: ?>>, memref<8x4xf32, strided<[?, ?], offset: ?>>) outs({{.*}} : memref<4x4xf32, strided<[?, ?], offset: ?>>)
     %8 = linalg.matmul ins(%3, %6 : tensor<4x8xf32>, tensor<8x4xf32>) outs(%7 : tensor<4x4xf32>) -> tensor<4x4xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %8 into %o[%1, %4] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<8x8xf32>
     }
   }
@@ -658,19 +658,19 @@ func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
   // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
   // CHECK: memref.copy %[[t]], %[[t_copy]]
 
-  // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
+  // CHECK: scf.forall (%{{.*}}) in (%{{.*}}) {
 
   // Load from the copy and store into the shared output.
   // CHECK:   %[[subview:.*]] = memref.subview %[[t]]
   // CHECK:   memref.load %[[t_copy]]
   // CHECK:   memref.store %{{.*}}, %[[subview]]
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
+  %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
     %offset = arith.muli %c5, %tid : index
     %slice = tensor.extract_slice %o[%offset] [5] [1]
         : tensor<10xf32> to tensor<5xf32>
     %r2 = tensor.extract %t[%tid] : tensor<10xf32>
     %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %i into %o[%offset] [5] [1]
           : tensor<5xf32> into tensor<10xf32>
     }
@@ -691,8 +691,8 @@ func.func @scf_foreach_privatized_but_not_copied(
 
   // CHECK-NOT: memref.alloc
   // CHECK-NOT: memref.copy
-  // CHECK: scf.foreach_thread {{.*}} {
-  %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
+  // CHECK: scf.forall {{.*}} {
+  %0 = scf.forall (%tid) in (%c2) shared_outs(%o = %t0) -> tensor<10xf32> {
     %offset = arith.muli %c5, %tid : index
     %slice = tensor.extract_slice %o[%offset] [5] [1]
         : tensor<10xf32> to tensor<5xf32>
@@ -701,7 +701,7 @@ func.func @scf_foreach_privatized_but_not_copied(
     // CHECK: memref.load %[[t1]]
     %r2 = tensor.extract %t1[%tid] : tensor<10xf32>
     %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %i into %o[%offset] [5] [1]
           : tensor<5xf32> into tensor<10xf32>
     }

diff  --git a/mlir/test/Dialect/SCF/ops.mlir b/mlir/test/Dialect/SCF/ops.mlir
index 6d79f2cd01740..2314516fa1079 100644
--- a/mlir/test/Dialect/SCF/ops.mlir
+++ b/mlir/test/Dialect/SCF/ops.mlir
@@ -311,21 +311,21 @@ func.func @execute_region() -> i64 {
   return %res : i64
 }
 
-// CHECK-LABEL: func.func @normalized_foreach_thread
-func.func @normalized_foreach_thread(%in: tensor<100xf32>, %out: tensor<100xf32>) {
+// CHECK-LABEL: func.func @normalized_forall
+func.func @normalized_forall(%in: tensor<100xf32>, %out: tensor<100xf32>) {
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  //      CHECK:    scf.foreach_thread
+  //      CHECK:    scf.forall
   // CHECK-NEXT:  tensor.extract_slice
-  // CHECK-NEXT:  scf.foreach_thread.perform_concurrently
+  // CHECK-NEXT:  scf.forall.in_parallel
   // CHECK-NEXT:  tensor.parallel_insert_slice
   // CHECK-NEXT:  }
   // CHECK-NEXT:  }
   // CHECK-NEXT:  return
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs(%o = %out) -> tensor<100xf32> {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
       }
@@ -333,23 +333,23 @@ func.func @normalized_foreach_thread(%in: tensor<100xf32>, %out: tensor<100xf32>
   return
 }
 
-// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread
-func.func @explicit_loop_bounds_foreach_thread(%in: tensor<100xf32>,
+// CHECK-LABEL: func.func @explicit_loop_bounds_forall
+func.func @explicit_loop_bounds_forall(%in: tensor<100xf32>,
     %out: tensor<100xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  //      CHECK:    scf.foreach_thread
+  //      CHECK:    scf.forall
   // CHECK-NEXT:  tensor.extract_slice
-  // CHECK-NEXT:  scf.foreach_thread.perform_concurrently
+  // CHECK-NEXT:  scf.forall.in_parallel
   // CHECK-NEXT:  tensor.parallel_insert_slice
   // CHECK-NEXT:  }
   // CHECK-NEXT:  }
   // CHECK-NEXT:  return
-  %result = scf.foreach_thread (%thread_idx) =  (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
+  %result = scf.forall (%thread_idx) =  (%c0) to (%num_threads) step (%c1) shared_outs(%o = %out) -> tensor<100xf32> {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         tensor.parallel_insert_slice %1 into %o[%thread_idx][1][1] :
           tensor<1xf32> into tensor<100xf32>
       }
@@ -357,32 +357,32 @@ func.func @explicit_loop_bounds_foreach_thread(%in: tensor<100xf32>,
   return
 }
 
-// CHECK-LABEL: func.func @normalized_foreach_thread_elide_terminator
-func.func @normalized_foreach_thread_elide_terminator() -> () {
+// CHECK-LABEL: func.func @normalized_forall_elide_terminator
+func.func @normalized_forall_elide_terminator() -> () {
   %num_threads = arith.constant 100 : index
 
-  //      CHECK:    scf.foreach_thread
+  //      CHECK:    scf.forall
   // CHECK-NEXT:  } {mapping = [#gpu.thread<x>]}
   // CHECK-NEXT:  return
-  scf.foreach_thread (%thread_idx) in (%num_threads) {
-    scf.foreach_thread.perform_concurrently {
+  scf.forall (%thread_idx) in (%num_threads) {
+    scf.forall.in_parallel {
     }
   } {mapping = [#gpu.thread<x>]}
   return
 
 }
 
-// CHECK-LABEL: func.func @explicit_loop_bounds_foreach_thread_elide_terminator
-func.func @explicit_loop_bounds_foreach_thread_elide_terminator() -> () {
+// CHECK-LABEL: func.func @explicit_loop_bounds_forall_elide_terminator
+func.func @explicit_loop_bounds_forall_elide_terminator() -> () {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  //      CHECK:    scf.foreach_thread
+  //      CHECK:    scf.forall
   // CHECK-NEXT:  } {mapping = [#gpu.thread<x>]}
   // CHECK-NEXT:  return
-  scf.foreach_thread (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
-    scf.foreach_thread.perform_concurrently {
+  scf.forall (%thread_idx) = (%c0) to (%num_threads) step (%c1) {
+    scf.forall.in_parallel {
     }
   } {mapping = [#gpu.thread<x>]}
   return

diff  --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir
index 2c0b87178b013..8a5e04750e7ce 100644
--- a/mlir/test/Dialect/Tensor/canonicalize.mlir
+++ b/mlir/test/Dialect/Tensor/canonicalize.mlir
@@ -1531,12 +1531,12 @@ func.func @canonicalize_parallel_insert_slice_indices(
   %c1 = arith.constant 1 : index
 
   //  CHECK-NOT: tensor.cast
-  //      CHECK: scf.foreach_thread (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<?x?xf32>) {
-  // CHECK-NEXT:   scf.foreach_thread.perform_concurrently {
+  //      CHECK: scf.forall (%[[tidx:[0-9a-z]*]]) in (%[[num_threads]]) shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<?x?xf32>) {
+  // CHECK-NEXT:   scf.forall.in_parallel {
   // CHECK-NEXT:     tensor.parallel_insert_slice %[[arg0]] into %[[o]][%[[tidx]], 0] [1, 5] [1, 1]
-  %2 = scf.foreach_thread (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
+  %2 = scf.forall (%tidx) in (%num_threads) shared_outs(%o = %arg1) -> (tensor<?x?xf32>) {
     %3 = tensor.cast %arg0 : tensor<1x5xf32> to tensor<?x5xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %3 into %o[%tidx, %c0] [%c1, 5] [%c1, %c1] : tensor<?x5xf32> into tensor<?x?xf32>
     }
   }
@@ -1553,11 +1553,11 @@ func.func @dont_fold_parallel_insert_slice(
 {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  //      CHECK: scf.foreach_thread () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
-  // CHECK-NEXT:   scf.foreach_thread.perform_concurrently {
+  //      CHECK: scf.forall () in () shared_outs(%[[o:.*]] = %[[arg1]]) -> (tensor<1x5xf32>) {
+  // CHECK-NEXT:   scf.forall.in_parallel {
   // CHECK-NEXT:     tensor.parallel_insert_slice %[[arg0]] into %[[o]][0, 0] [1, 5] [1, 1] : tensor<1x5xf32> into tensor<1x5xf32>
-  %2 = scf.foreach_thread () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
-    scf.foreach_thread.perform_concurrently {
+  %2 = scf.forall () in () shared_outs(%o = %arg1) -> (tensor<1x5xf32>) {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %arg0 into %o[%c0, %c0] [1, 5] [%c1, %c1] : tensor<1x5xf32> into tensor<1x5xf32>
     }
   }

diff  --git a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
index f366d331dff58..3669cae87408d 100644
--- a/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
+++ b/mlir/test/Dialect/Tensor/extract-slice-from-collapse-shape.mlir
@@ -28,11 +28,11 @@ func.func @extract_slice_static(%input: tensor<3x5x7x11xf32>) -> tensor<20x11xf3
 // FOREACH-DAG: %[[c5:.+]] = arith.constant 5 : index
 // FOREACH-DAG: %[[c7:.+]] = arith.constant 7 : index
 // FOREACH-DAG: %[[init:.+]] = tensor.empty() : tensor<20x11xf32>
-//     FOREACH: %[[tile:.+]] = scf.foreach_thread (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]])
+//     FOREACH: %[[tile:.+]] = scf.forall (%[[iv:.+]]) in (20) shared_outs(%[[dest:.+]] = %[[init]])
 //     FOREACH:   %[[multiIndex:.+]]:3 = affine.delinearize_index %[[iv]] into (%[[c3]], %[[c5]], %[[c7]]
 //     FOREACH:   %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex]]#0, %[[multiIndex]]#1, %[[multiIndex]]#2, 0] [1, 1, 1, 11] [1, 1, 1, 1] :
 //     FOREACH:   %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3]{{\]}} :
-//     FOREACH:   perform_concurrently
+//     FOREACH:   in_parallel
 // FOREACH-NEXT:   tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[iv]], 0] [1, 11] [1, 1] :
 //     FOREACH: return %[[tile]]
 
@@ -136,14 +136,14 @@ func.func @extract_slice_dynamic_multidim(%input: tensor<3x?x?x11x?xf32>, %offt0
 // FOREACH-DAG:     %[[d1:.+]] = tensor.dim %[[arg0]], %[[c1]] :
 // FOREACH-DAG:     %[[d2:.+]] = tensor.dim %[[arg0]], %[[c2]] :
 // FOREACH-DAG:     %[[d4:.+]] = tensor.dim %[[arg0]], %[[c4]] :
-//     FOREACH:     %[[tile1:.+]] = scf.foreach_thread (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
+//     FOREACH:     %[[tile1:.+]] = scf.forall (%[[tid1:.+]], %[[tid2:.+]]) in (%[[sz1]], %[[sz2]]) shared_outs(%[[dest:.+]] = %[[init]])
 // FOREACH-DAG:       %[[iv1:.+]] = affine.apply #[[map1]](%[[tid1]])[%[[lb1]]]
 //     FOREACH:       %[[multiIndex1:.+]]:3 = affine.delinearize_index %[[iv1]] into (%[[c3]], %[[d1]], %[[d2]]) :
 // FOREACH-DAG:       %[[iv2:.+]] = affine.apply #[[map1]](%[[tid2]])[%[[lb2]]]
 //     FOREACH:       %[[multiIndex2:.+]]:2 = affine.delinearize_index %[[iv2]] into (%[[c11]], %[[d4]]) :
 //     FOREACH:       %[[slice:.+]] = tensor.extract_slice %[[arg0]][%[[multiIndex1]]#0, %[[multiIndex1]]#1, %[[multiIndex1]]#2, %[[multiIndex2]]#0, %[[multiIndex2]]#1] [1, 1, 1, 1, 1] [1, 1, 1, 1, 1] :
 //     FOREACH:       %[[sliceFlat:.+]] = tensor.collapse_shape %[[slice]] {{\[}}[0, 1, 2], [3, 4]{{\]}} :
-//     FOREACH:       perform_concurrently
+//     FOREACH:       in_parallel
 //FOREACH-NEXT:         tensor.parallel_insert_slice %[[sliceFlat]] into %[[dest]][%[[tid1]], %[[tid2]]] [1, 1] [1, 1] :
 
 // -----

diff  --git a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
index a120b0f1a9cab..2358ddeb5b01b 100644
--- a/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
+++ b/mlir/test/Dialect/Tensor/fold-consecutive-insert-extract-slice.mlir
@@ -90,9 +90,9 @@ func.func @insert_slice_rank_reducing_dynamic_shape(
 func.func @parallel_insert_slice(%t0: tensor<1x2xf32>, %t1: tensor<f32>, %t2: tensor<1x1xf32>) -> tensor<1x2xf32> {
   %c1 = arith.constant 1 : index
   %c2 = arith.constant 2 : index
-  %r = scf.foreach_thread (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
+  %r = scf.forall (%arg2, %arg3) in (%c1, %c2) shared_outs(%arg4 = %t0) -> (tensor<1x2xf32>) {
     %inserted_slice = tensor.insert_slice %t1 into %t2[0, 0] [1, 1] [1, 1] : tensor<f32> into tensor<1x1xf32>
-    scf.foreach_thread.perform_concurrently {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %inserted_slice into %arg4[%arg2, %arg3] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x2xf32>
     }
   }

diff  --git a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
index e6256a9b9ea8d..625408dfefe21 100644
--- a/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
+++ b/mlir/test/Dialect/Tensor/fold-reassociative-reshapes.mlir
@@ -44,8 +44,8 @@ func.func @rank_reducing_parallel_insert_of_collapse_shape(
   -> tensor<?x?x?x?xf32> {
   %0 = tensor.collapse_shape %t [[0, 1], [2], [3]]
       : tensor<?x1x1x5xf32> into tensor<?x1x5xf32>
-  %1 = scf.foreach_thread (%iv) in (%thr) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
-    scf.foreach_thread.perform_concurrently {
+  %1 = scf.forall (%iv) in (%thr) shared_outs(%o = %d) -> (tensor<?x?x?x?xf32>) {
+    scf.forall.in_parallel {
       tensor.parallel_insert_slice %0 into %o[0, 0, 0, 0][%sz, 1, 1, 5][1, 1, 1, 1]
           : tensor<?x1x5xf32> into tensor<?x?x?x?xf32>
     }

diff  --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index 25164a4ba870c..89c6974aa9cbf 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -196,10 +196,10 @@ func.func @rank_reducing_parallel_insert_slice(%in: tensor<100xf32>, %out: tenso
   %c1 = arith.constant 1 : index
   %num_threads = arith.constant 100 : index
 
-  // CHECK: scf.foreach_thread {{.*}} {
-  %result = scf.foreach_thread (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
+  // CHECK: scf.forall {{.*}} {
+  %result = scf.forall (%thread_idx) in (%num_threads) shared_outs (%o = %out) -> tensor<200x100xf32> {
       %1 = tensor.extract_slice %in[%thread_idx][1][1] : tensor<100xf32> to tensor<1xf32>
-      scf.foreach_thread.perform_concurrently {
+      scf.forall.in_parallel {
         // CHECK: memref.subview %{{.*}}[%{{.*}}] [1] [1] : memref<100xf32, strided<[?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
         // CHECK: memref.subview %{{.*}}[1, %{{.*}}] [1, 1] [1, 1] : memref<200x100xf32, strided<[?, ?], offset: ?>> to memref<1xf32, strided<[?], offset: ?>>
         tensor.parallel_insert_slice %1 into %o[1, %thread_idx][1, 1][1, 1] :

diff  --git a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
index 6dcf6cf97339d..f9037fe6099ff 100644
--- a/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
+++ b/mlir/test/lib/Dialect/Tensor/TestTensorTransforms.cpp
@@ -82,7 +82,7 @@ struct TestTensorTransforms
   Option<bool> useForeach{
       *this, "use-foreach",
       llvm::cl::desc(
-          "Use the scf.foreach_thread operation when generating loop nests for "
+          "Use the scf.forall operation when generating loop nests for "
           "the extract_slice of collapse_shape pattern"),
       llvm::cl::init(false)};
 
@@ -247,7 +247,7 @@ struct RewriteExtractSliceFromCollapseShapeUsingScfForeach
                                 tensor::ExtractSliceFromCollapseHelper &helper,
                                 PatternRewriter &rewriter) const override {
     Location loc = op.getLoc();
-    auto foreachThreadOp = rewriter.create<scf::ForeachThreadOp>(
+    auto forallOp = rewriter.create<scf::ForallOp>(
         loc, /*numThreads=*/getAsOpFoldResult(helper.getIterationSpaceSizes()),
         /*outputs=*/dest,
         /*mapping=*/std::nullopt,
@@ -263,12 +263,12 @@ struct RewriteExtractSliceFromCollapseShapeUsingScfForeach
           auto [tile, insertParams] =
               helper.emitLoopNestBody(nestedBuilder, loc, outputIvs);
           // Insert the slice into the destination.
-          auto term = nestedBuilder.create<scf::PerformConcurrentlyOp>(loc);
+          auto term = nestedBuilder.create<scf::InParallelOp>(loc);
           nestedBuilder.setInsertionPointToStart(term.getBody());
           nestedBuilder.create<tensor::ParallelInsertSliceOp>(
               loc, tile, outputArgs[0], insertParams);
         });
-    rewriter.replaceOp(op, foreachThreadOp->getResult(0));
+    rewriter.replaceOp(op, forallOp->getResult(0));
     return success();
   }
 };

diff  --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py
index e7696033980fe..10625d3fa523c 100644
--- a/mlir/test/python/dialects/transform_structured_ext.py
+++ b/mlir/test/python/dialects/transform_structured_ext.py
@@ -188,7 +188,7 @@ def testTileExplicitLoopTypeAll():
                                   [], transform.AnyOpType.get())
   types = [
       transform.OperationType.get(x)
-      for x in ["scf.for", "scf.parallel", "scf.foreach_thread"]
+      for x in ["scf.for", "scf.parallel", "scf.forall"]
   ]
   with InsertionPoint(sequence.body):
     structured.TileOp(types, sequence.bodyTarget, sizes=[2, 3, 4])
@@ -196,7 +196,7 @@ def testTileExplicitLoopTypeAll():
   # CHECK-LABEL: TEST: testTileExplicitLoopTypeAll
   # CHECK: = transform.structured.tile
   # CHECK-SAME : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">,
-  # CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.foreach_thread">
+  # CHECK-SAME: !transform.op<"scf.parallel">, !transform.op<"scf.forall">
 
 @run
 def testVectorize():


        

