[Mlir-commits] [mlir] 171a5a7 - [mlir][Linalg] Add a greedy transform to map copies to threads efficiently.

Nicolas Vasilache llvmlistbot at llvm.org
Mon Jul 10 09:16:10 PDT 2023


Author: Nicolas Vasilache
Date: 2023-07-10T16:11:04Z
New Revision: 171a5a761d24664786f65c861d083df306532a35

URL: https://github.com/llvm/llvm-project/commit/171a5a761d24664786f65c861d083df306532a35
DIFF: https://github.com/llvm/llvm-project/commit/171a5a761d24664786f65c861d083df306532a35.diff

LOG: [mlir][Linalg] Add a greedy transform to map copies to threads efficiently.

This revision adds a new transformation to map a copy operation to a GPU grid of threads.
It implements a first heuristic that allows trading off coalesced accesses against predication and occupancy.

Differential Revision: https://reviews.llvm.org/D154836
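
For illustration, a minimal standalone C++ sketch of how the new heuristic is
meant to be queried (the shape and constants mirror one of the tests added
below; the wrapper function is hypothetical):

#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

// Query the heuristic for a 16x32x64xi8 copy distributed over 32 threads with
// a desired 128-bit alignment. `ctx` is assumed to have the GPU dialect loaded
// so that the #gpu.linear<...> mapping attributes can be built.
static void exampleCopyMapping(MLIRContext *ctx) {
  transform::gpu::CopyMappingInfo mapping(
      /*ctx=*/ctx,
      /*totalNumThreads=*/32,
      /*desiredBitAlignment=*/128,
      /*sizes=*/{16, 32, 64},
      /*favorPredication=*/false,
      /*elementalBitwidth=*/8);
  if (mapping.status == transform::gpu::CopyMappingInfo::Status::Invalid)
    return;
  // For this shape the heuristic ends up with numThreads = {1, 8, 4}, a
  // 16-element (128-bit) vector size on the most minor dimension, and
  // 16x4x16 bounding tile sizes, matching the corresponding test.
  (void)mapping.numThreads;
  (void)mapping.smallestBoundingTileSizes;
}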

Added: 
    mlir/include/mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h
    mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp
    mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir

Modified: 
    mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
    mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
    mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt
    mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h
new file mode 100644
index 00000000000000..f9fd32c2031afb
--- /dev/null
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h
@@ -0,0 +1,127 @@
+//===- GPUHeuristics.h - GPU heuristics for Linalg transforms ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
+#define MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
+
+#include "mlir/IR/Attributes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace mlir {
+namespace transform {
+namespace gpu {
+
+/// Base struct to hold GPU mapping information for a given operation.
+struct MappingInfo {
+  /// Number of threads to use for the mapping.
+  /// Note: When the number of threads used is smaller than the total number of
+  /// available threads, predication ensues. It is often useful to use more
+  /// threads and saturate memory bandwidth for some operations, even if others
+  /// end up being predicated.
+  SmallVector<int64_t> numThreads;
+
+  /// Thread mapping attributes, one per entry of `numThreads`.
+  SmallVector<Attribute> threadMapping;
+};
+
+struct CopyMappingInfo : public MappingInfo {
+  /// Status of the mapping computation. Invalid means that more threads are
+  /// required than are available and we fail to map. This usually happens when
+  /// the copy is too large compared to the number of threads.
+  enum class Status { Success = 0, RequiresPredication, Invalid };
+
+  /// Greedily compute the MappingInfo to use to perform a copy of `sizes`
+  /// elements of bitwidth `elementalBitwidth`.
+  /// The `desiredBitAlignment` is the number of bits by which the most minor
+  /// dimension of the copy is expected to be aligned.
+  /// This is an approximation of the final alignment, for each row of the copy.
+  /// It is used to restrict the size of the copied vectors so that they match
+  /// a potential subsequent cp.async.
+  /// If the alignment does not match the alignment required by cp.async down
+  /// the line, the conversion to cp.async will eventually be skipped, possibly
+  /// degrading performance.
+  /// When `favorPredication` is false, the mapping is computed to fill all
+  /// threads with an equal amount of data to copy, so as to avoid predication.
+  /// Predication ends up requiring a split epilogue in current pipelining
+  /// implementations and is better avoided when possible.
+  CopyMappingInfo(MLIRContext *ctx, int totalNumThreads,
+                  int64_t desiredBitAlignment, ArrayRef<int64_t> sizes,
+                  bool favorPredication = false,
+                  int64_t elementalBitwidth = 32);
+
+private:
+  /// Determine the maximal vector size to use to copy a contiguous array of
+  /// `numContiguousElements`, each of bitwidth `elementalBitwidth`.
+  /// The `alignment` is the number of bits by which the most minor dimension
+  /// of the copy is aligned. This is an approximation of the actual memory
+  /// alignment after bufferization, for each row of the copy. This is used to
+  /// restrict the size of the copied vector so that it is properly aligned
+  /// with the requirements of cp.async. If the copy alignment does not match
+  /// the alignment required by cp.async, the conversion to cp.async will be
+  /// skipped.
+  /// Asserts that `elementalBitwidth` divides `alignment`.
+  static int64_t
+  maxContiguousElementsToTransfer(int64_t alignment,
+                                  int64_t numContiguousElements,
+                                  int64_t elementalBitwidth = 32);
+
+  /// Compute the number of threads to use to perform a copy of `sizes`
+  /// elements of `elementalBitwidth`.
+  /// The `desiredVectorSize` is the number of contiguous elements to transfer
+  /// in a single vector along the most minor dimension. It is derived from the
+  /// approximate memory alignment after bufferization, for each row of the
+  /// copy, and is used to restrict the size of the copied vector so that it is
+  /// properly aligned with the requirements of cp.async. If the copy alignment
+  /// does not match the alignment required by cp.async, the conversion to
+  /// cp.async will be skipped.
+  /// When `favorPredication` is false, the implementation avoids predication
+  /// in the copy, even if it means reducing the granularity of the transfer.
+  /// Otherwise, the implementation will come up with a maximal assignment of
+  /// the remaining threads to sizes of interest, using a DP implementation.
+  Status inferNumThreads(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
+                         int64_t desiredVectorSize, bool favorPredication);
+  Status inferNumThreadsImpl(int64_t totalNumThreads, ArrayRef<int64_t> sizes,
+                             int64_t desiredVectorSize);
+
+public:
+  // Pretty-printing and diagnostic methods.
+  void print(llvm::raw_ostream &os) const;
+  LLVM_DUMP_METHOD void dump() const;
+
+  /// Static quantity determining the number of bits to target in an individual
+  /// copy. Assumes that smaller increments of 64, 32, 16, 8 are also valid
+  /// transfer sizes. In the future we should have more hardware pluggability
+  /// here, especially when we want sub-byte granularity.
+  static constexpr int64_t kMaxVectorLoadBitWidth = 128;
+
+  /// Most minor vector size (i.e. 1-D), in number of elements, used in a copy.
+  int64_t vectorSize;
+
+  /// Number of threads to use for the copy mapping, from most major to most
+  /// minor dims (i.e. numThreads.back() should be mapped to contiguous threads
+  /// for best coalescing).
+  using MappingInfo::numThreads;
+
+  /// Explicit computation / injection of the smallest bounding tile sizes after
+  /// mapping to `numThreads`. This is useful in masked scenarios.
+  SmallVector<int64_t> smallestBoundingTileSizes;
+
+  /// Thread mapping attributes, one per entry of `numThreads`.
+  using MappingInfo::threadMapping;
+
+  /// The status of a particular copy mapping. Must be checked before applying
+  /// transformations.
+  Status status;
+};
+
+} // namespace gpu
+} // namespace transform
+} // namespace mlir
+
+#endif // MLIR_DIALECT_LINALG_TRANSFORMOPS_GPUHEURISTICS_H
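
As a worked illustration of the `maxContiguousElementsToTransfer` contract
documented above, here is a hypothetical standalone mirror of the gcd
computation (the helper name is made up; the 256xf16 and 512xi8 numbers mirror
tests added later in this commit):

#include <cassert>
#include <cstdint>
#include <numeric>

// Largest number of contiguous elements that can be transferred in one vector
// while respecting the desired alignment and the 128-bit max vector load width.
static int64_t maxContiguousElements(int64_t desiredBitAlignment,
                                     int64_t numContiguousElements,
                                     int64_t elementalBitwidth) {
  constexpr int64_t kMaxVectorLoadBitWidth = 128;
  assert(desiredBitAlignment % elementalBitwidth == 0);
  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0);
  return std::gcd(std::gcd(desiredBitAlignment / elementalBitwidth,
                           numContiguousElements),
                  kMaxVectorLoadBitWidth / elementalBitwidth);
}

int main() {
  // 256 contiguous f16 elements with a 128-bit desired alignment:
  // gcd(gcd(128/16, 256), 128/16) = 8 elements, i.e. a full 128-bit transfer.
  assert(maxContiguousElements(128, 256, 16) == 8);
  // 512 contiguous i8 elements with only 8-bit alignment guaranteed:
  // gcd(gcd(8/8, 512), 128/8) = 1 element, forcing 1 element per transfer.
  assert(maxContiguousElements(8, 512, 8) == 1);
  return 0;
}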

diff  --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
index 797c6945c3cbff..d71f254d1dcbc7 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h
@@ -22,6 +22,7 @@ class TilingInterface;
 class RewriterBase;
 
 namespace linalg {
+class CopyOp;
 struct ForallTilingResult;
 class GenericOp;
 class LinalgOp;

diff  --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 086a37452debf0..82a1fcc5c8f4a4 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -2261,4 +2261,60 @@ def InsertSliceToCopyOp :
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// MapCopyToThreadsOp
+//===----------------------------------------------------------------------===//
+
+def MapCopyToThreadsOp :
+  Op<Transform_Dialect, "structured.gpu.map_copy_to_threads",
+    [FunctionalStyleTransformOpTrait, 
+     MemoryEffectsOpInterface,
+     TransformEachOpTrait, 
+     TransformOpInterface]> {
+  let description = [{
+    Targeted mapping of a copy operation on tensors to GPU threads.
+
+    This operation implements a greedy heuristic that determines a good
+    distribution of threads over which to break down the copy operation.
+    The heuristic is driven by considerations related to the underlying
+    architecture, for which good high-level decisions are needed under the
+    assumption of certain hardware features. Relevant features are exposed via
+    first-class attributes to control the behavior of the transformation at a
+    high level.
+
+    For now, a single heuristic is implemented; it can be extended on a
+    per-need basis.
+
+    #### Return modes:
+
+    This operation produces a silenceable failure if the target is not a
+    statically shaped linalg.copy op on a tensor of rank at most 3, or if no
+    valid thread mapping can be found for the requested number of threads and
+    alignment. Otherwise it succeeds and returns a handle to the tiled
+    linalg.copy op.
+  }];
+
+  let arguments = (ins TransformHandleTypeInterface:$target,
+                       I64Attr:$total_num_threads,
+                       I64Attr:$desired_bit_alignment);
+  let results = (outs TransformHandleTypeInterface:$transformed);
+
+  let assemblyFormat = [{
+    $target
+    `total_num_threads` `=` $total_num_threads
+    `desired_bit_alignment` `=` $desired_bit_alignment
+    attr-dict 
+    `:` functional-type(operands, results)
+  }];
+
+  let builders = [
+    OpBuilder<(ins "Value":$target)>,
+  ];
+  let extraClassDeclaration = [{
+    ::mlir::DiagnosedSilenceableFailure applyToOne(
+        ::mlir::transform::TransformRewriter &rewriter,
+        ::mlir::linalg::CopyOp copyOp,
+        ::mlir::transform::ApplyToEachResultList &results,
+        ::mlir::transform::TransformState &state);
+
+    ::llvm::SmallVector<::mlir::OpFoldResult> getMixedNumThreads();
+  }];
+}
+
 #endif // LINALG_TRANSFORM_OPS

diff  --git a/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt
index 129863608d01c5..bef72c3283ca7f 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt
+++ b/mlir/lib/Dialect/Linalg/TransformOps/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_dialect_library(MLIRLinalgTransformOps
   DialectExtension.cpp
+  GPUHeuristics.cpp
   LinalgMatchOps.cpp
   LinalgTransformOps.cpp
   Syntax.cpp

diff  --git a/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp b/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp
new file mode 100644
index 00000000000000..e4fa8d6bc74b74
--- /dev/null
+++ b/mlir/lib/Dialect/Linalg/TransformOps/GPUHeuristics.cpp
@@ -0,0 +1,267 @@
+//===- GPUHeuristics.cpp - Heuristics Implementation for Transforms -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
+
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Support/MathExtras.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cmath>
+#include <numeric>
+
+using namespace mlir;
+
+#define DEBUG_TYPE "linalg-transforms"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")
+
+static Attribute linearIdX(MLIRContext *ctx) {
+  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimX);
+}
+static Attribute linearIdY(MLIRContext *ctx) {
+  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimY);
+}
+static Attribute linearIdZ(MLIRContext *ctx) {
+  return gpu::GPULinearIdMappingAttr::get(ctx, gpu::LinearId::DimZ);
+}
+
+transform::gpu::CopyMappingInfo::CopyMappingInfo(MLIRContext *ctx,
+                                                 int totalNumThreads,
+                                                 int64_t desiredBitAlignment,
+                                                 ArrayRef<int64_t> copySizes,
+                                                 bool favorPredication,
+                                                 int64_t elementalBitwidth) {
+  assert(!copySizes.empty() && copySizes.size() <= 3 &&
+         "only 1,2,3-D copies are supported for now");
+
+  LDBG("START CopyMappingInfo, favorPredication: " << favorPredication);
+  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy shape: ");
+             llvm::dbgs() << "\n";);
+
+  // Greedily find the largest vector size that can be used to copy the most
+  // minor dimension: we are in the business of filling kMaxVectorLoadBitWidth
+  // contiguous memory transactions with as few threads as possible.
+  int64_t desiredVectorSize = CopyMappingInfo::maxContiguousElementsToTransfer(
+      desiredBitAlignment, copySizes.back(), elementalBitwidth);
+
+  LDBG("--greedily determined vectorSize: "
+       << desiredVectorSize << " elements of " << elementalBitwidth
+       << "b each -> " << (desiredVectorSize * elementalBitwidth)
+       << "b total out of a max of " << kMaxVectorLoadBitWidth << "b");
+
+  status = inferNumThreads(totalNumThreads, copySizes, desiredVectorSize,
+                           favorPredication);
+  if (status == Status::Invalid)
+    return;
+
+  LLVM_DEBUG(llvm::interleaveComma(copySizes, DBGS() << "--copy: ");
+             llvm::dbgs() << "\n"; llvm::interleaveComma(
+                 this->numThreads, DBGS() << "--numThreads: ");
+             llvm::dbgs() << "\n";);
+  LDBG("--vectorSize: " << this->vectorSize);
+  assert(this->numThreads.size() == copySizes.size() &&
+         "compute copy mapping expected same number of threads and copy sizes");
+
+  // Compute the smallest bounding box.
+  this->smallestBoundingTileSizes = llvm::to_vector(
+      llvm::map_range(llvm::zip(copySizes, this->numThreads), [](auto &&pair) {
+        int64_t size, numThreads;
+        std::tie(size, numThreads) = pair;
+        return mlir::ceilDiv(size, numThreads);
+      }));
+  SmallVector<Attribute> allThreadMappings{linearIdZ(ctx), linearIdY(ctx),
+                                           linearIdX(ctx)};
+
+  // Set the thread mapping.
+  this->threadMapping =
+      llvm::to_vector(ArrayRef(allThreadMappings)
+                          .take_back(this->smallestBoundingTileSizes.size()));
+  LLVM_DEBUG(this->print(DBGS()); llvm::dbgs() << "\n");
+}
+
+int64_t transform::gpu::CopyMappingInfo::maxContiguousElementsToTransfer(
+    int64_t desiredBitAlignment, int64_t numContiguousElements,
+    int64_t elementalBitwidth) {
+  assert(kMaxVectorLoadBitWidth % elementalBitwidth == 0 &&
+         "elemental bitwidth does not divide kMaxVectorLoadBitWidth");
+  assert(desiredBitAlignment % elementalBitwidth == 0 &&
+         "elemental bitwidth does not divide desired bit alignment");
+  return std::gcd(
+      std::gcd(desiredBitAlignment / elementalBitwidth, numContiguousElements),
+      kMaxVectorLoadBitWidth / elementalBitwidth);
+}
+
+/// Get the list of all factors that divide `val`, not just the prime factors.
+static SmallVector<int64_t> getFactors(int64_t val) {
+  SmallVector<int64_t> factors;
+  factors.reserve(val);
+  for (int64_t factor = 1; factor <= val; ++factor) {
+    if (val % factor != 0)
+      continue;
+    factors.push_back(factor);
+  }
+  factors.push_back(val);
+  return factors;
+}
+
+static int64_t product(ArrayRef<int64_t> vals) {
+  int64_t res = 1;
+  for (auto val : vals)
+    res *= val;
+  return res;
+}
+
+/// Extract `result` from `sizes` with the following constraints:
+///   1. sizes[i] % result[i] == 0 for all i
+///   2. product_of_threadsPerDim <= maxNumThreads
+///   3. if `currentIndex` is sizes.size() - 1, then threadsPerDim[currentIndex]
+///      must be sizes[currentIndex].
+/// This is used to greedily extract the maximum number of threads usable for
+/// mapping a copy of size `sizes`, while being bounded by `maxNumThreads` and
+/// ensuring coalesced access along the most minor dimension.
+/// Returns the number of threads to use per dimension, in the range:
+///   threadsPerDim[currentIndex .. sizes.size() - 1]
+// The implementation uses a dynamic programming approach to greedily extract
+// the best combination under the constraints.
+// TODO: Implementation details can be improved but putting effort there is a
+// tradeoff: `sizes` is expected to be of small rank and contain small values.
+static SmallVector<int64_t> maximizeNumThreads(ArrayRef<int64_t> sizes,
+                                               int64_t currentIndex,
+                                               int64_t maxNumThreads) {
+  assert(static_cast<size_t>(currentIndex) < sizes.size() &&
+         "currentIndex out of bounds");
+  std::string indent(2 * currentIndex, '-');
+  if (static_cast<size_t>(currentIndex) == sizes.size() - 1) {
+    LDBG(indent << "mandated globalBest: " << sizes[currentIndex]);
+    return SmallVector<int64_t>{sizes[currentIndex]};
+  }
+
+  int64_t best = 0;
+  int64_t s = sizes[currentIndex];
+  SmallVector<int64_t> factors = getFactors(s);
+  SmallVector<int64_t> localThreadsPerDim;
+  localThreadsPerDim.reserve(sizes.size());
+  LDBG(indent << "maximizeNumThreads in " << s
+              << " with limit: " << maxNumThreads);
+  for (auto factor : factors) {
+    auto nestedThreadsPerDim =
+        maximizeNumThreads(sizes, currentIndex + 1, maxNumThreads / factor);
+    int64_t localBest = factor * product(nestedThreadsPerDim);
+    if (localBest > best && localBest <= maxNumThreads) {
+      LDBG(indent << "new localBest: " << localBest);
+      LLVM_DEBUG(
+          llvm::interleaveComma(nestedThreadsPerDim,
+                                DBGS() << indent << "nestedThreadsPerDim: ");
+          llvm::dbgs() << "\n";);
+      localThreadsPerDim.clear();
+      localThreadsPerDim.push_back(factor);
+      llvm::append_range(localThreadsPerDim, nestedThreadsPerDim);
+      best = localBest;
+    }
+  }
+
+  LDBG(indent << "found globalBest: " << best);
+  LLVM_DEBUG(llvm::interleaveComma(localThreadsPerDim,
+                                   DBGS() << indent << "numThreads: ");
+             llvm::dbgs() << "\n";);
+
+  return localThreadsPerDim;
+}
+
+transform::gpu::CopyMappingInfo::Status
+transform::gpu::CopyMappingInfo::inferNumThreads(int64_t totalNumThreads,
+                                                 ArrayRef<int64_t> sizes,
+                                                 int64_t desiredVectorSize,
+                                                 bool favorPredication) {
+
+  if (!favorPredication) {
+    int64_t localVectorSize = desiredVectorSize;
+    for (; localVectorSize >= 1; localVectorSize /= 2) {
+      // Attempt to map the copy with predication and current fixed vector size:
+      //   1. if the status is Success, we are done.
+      //   2. if the status is Invalid, we fail immediately, no amount of
+      //   vector size reduction can offset the bad tile size selection from the
+      //   higher-level.
+      //   3. if the status is RequiresPredication, we try again with a smaller
+      //   vector size.
+      Status status =
+          inferNumThreadsImpl(totalNumThreads, sizes, localVectorSize);
+      if (status == Status::Success || status == Status::Invalid)
+        return status;
+
+      LDBG("requires predication, try reducing vector size to "
+           << (localVectorSize / 2));
+    }
+  }
+
+  // If we have not yet returned, it means that we have tried all vector sizes
+  // and we still require predication. Restart from the original vector size
+  // and do not attempt to avoid predication anymore.
+  return inferNumThreadsImpl(totalNumThreads, sizes, desiredVectorSize);
+}
+
+transform::gpu::CopyMappingInfo::Status
+transform::gpu::CopyMappingInfo::inferNumThreadsImpl(
+    int64_t totalNumThreads, ArrayRef<int64_t> sizes,
+    int64_t desiredVectorSize) {
+  assert(sizes.back() % desiredVectorSize == 0 &&
+         "most-minor size not divisible by actualVectorSize");
+
+  LDBG("inferNumThreadsImpl with totalNumThreads: "
+       << totalNumThreads << " and vectorSize: " << desiredVectorSize);
+
+  // Scale the most minor size to account for the chosen vector size and
+  // maximize the number of threads without exceeding the total number of
+  // threads.
+  SmallVector<int64_t> scaledSizes{sizes};
+  scaledSizes.back() /= desiredVectorSize;
+  if (scaledSizes.back() > totalNumThreads) {
+    LDBG("--Too few threads given the required vector size -> FAIL");
+    return Status::Invalid;
+  }
+  SmallVector<int64_t> inferredNumThreads =
+      maximizeNumThreads(scaledSizes, 0, totalNumThreads);
+
+  LLVM_DEBUG(llvm::interleaveComma(inferredNumThreads,
+                                   DBGS() << "inferred numThreads: ");
+             llvm::dbgs() << "\n";
+             LDBG("computed actualVectorSize: " << desiredVectorSize););
+
+  // Corner case: we cannot use more threads than available. If the dimensions
+  // of the copy are that badly sized, it is because higher-level tiling did not
+  // do its job; we do not try to recover from it here.
+  int64_t totalNumThreadsUsed = product(inferredNumThreads);
+  LDBG("--totalNumThreadsUsed: " << totalNumThreadsUsed);
+  if (totalNumThreadsUsed == 0 || totalNumThreadsUsed > totalNumThreads) {
+    LDBG("--Too few threads given the required vector size -> FAIL");
+    return Status::Invalid;
+  }
+
+  this->vectorSize = desiredVectorSize;
+  this->numThreads = inferredNumThreads;
+  if (totalNumThreadsUsed == totalNumThreads)
+    return Status::Success;
+
+  return Status::RequiresPredication;
+}
+
+void transform::gpu::CopyMappingInfo::print(llvm::raw_ostream &os) const {
+  os << "MappingInfo{";
+  os << "CopyMappingInfo: ";
+  os << "valid: " << (status != Status::Invalid) << ", ";
+  os << "vectorSize: " << vectorSize << ", ";
+  llvm::interleaveComma(numThreads, os << ", numThreads: {");
+  llvm::interleaveComma(smallestBoundingTileSizes,
+                        os << "}, smallestBoundingTileSizes: {");
+  llvm::interleaveComma(threadMapping, os << "}, threadMapping: {");
+  os << "}}";
+}
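
To make the greedy/DP search in `maximizeNumThreads` concrete, here is a
hypothetical standalone mirror of the recursion (the 16x15x5 / 128-thread
numbers mirror one of the tests added below):

#include <cassert>
#include <cstdint>
#include <vector>

// Recursively pick a divisor of each size (the most minor size is mandated)
// so that the product of the picked divisors is as large as possible without
// exceeding maxNumThreads.
static std::vector<int64_t> maximizeThreads(const std::vector<int64_t> &sizes,
                                            size_t idx, int64_t maxNumThreads) {
  if (idx == sizes.size() - 1)
    return {sizes[idx]};
  int64_t best = 0;
  std::vector<int64_t> bestPick;
  for (int64_t factor = 1; factor <= sizes[idx]; ++factor) {
    if (sizes[idx] % factor != 0)
      continue;
    std::vector<int64_t> nested =
        maximizeThreads(sizes, idx + 1, maxNumThreads / factor);
    int64_t used = factor;
    for (int64_t n : nested)
      used *= n;
    if (used > best && used <= maxNumThreads) {
      best = used;
      bestPick = {factor};
      bestPick.insert(bestPick.end(), nested.begin(), nested.end());
    }
  }
  return bestPick;
}

int main() {
  // Mirrors the 16x15x5xi8 test with 128 threads: 5 is mandated on the most
  // minor dimension, 3 is picked next so that 8 fits on the outermost one,
  // using 8 * 3 * 5 = 120 of the 128 available threads.
  std::vector<int64_t> picked = maximizeThreads({16, 15, 5}, 0, 128);
  assert((picked == std::vector<int64_t>{8, 3, 5}));
  return 0;
}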

diff  --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index eed58bbbf1e18c..cf12e78145fd77 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/TransformOps/GPUHeuristics.h"
 #include "mlir/Dialect/Linalg/TransformOps/Syntax.h"
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
@@ -31,6 +32,7 @@
 #include "mlir/Dialect/Utils/StaticValueUtils.h"
 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/VectorRewritePatterns.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Interfaces/TilingInterface.h"
@@ -1700,10 +1702,10 @@ transform::PromoteOp::applyToOne(transform::TransformRewriter &rewriter,
     if (mapping.size() > 1)
       return emitDefaultDefiniteFailure(target);
 
-    auto addressSpace = cast<gpu::GPUMemorySpaceMappingAttr>(mapping[0]);
+    auto addressSpace = cast<mlir::gpu::GPUMemorySpaceMappingAttr>(mapping[0]);
 
     if (addressSpace.getAddressSpace() ==
-        gpu::GPUDialect::getWorkgroupAddressSpace()) {
+        mlir::gpu::GPUDialect::getWorkgroupAddressSpace()) {
       promotionOptions =
           promotionOptions
               .setAllocationDeallocationFns(allocateWorkgroupMemory,
@@ -1711,7 +1713,7 @@ transform::PromoteOp::applyToOne(transform::TransformRewriter &rewriter,
               .setCopyInOutFns(copyToWorkgroupMemory, copyToWorkgroupMemory)
               .setUseFullTileBuffers({false, false});
     } else if (addressSpace.getAddressSpace() ==
-               gpu::GPUDialect::getPrivateAddressSpace()) {
+               mlir::gpu::GPUDialect::getPrivateAddressSpace()) {
       promotionOptions =
           promotionOptions
               .setAllocationDeallocationFns(allocateGPUPrivateMemory,
@@ -3211,6 +3213,72 @@ DiagnosedSilenceableFailure transform::InsertSliceToCopyOp::applyToOne(
   return diag;
 }
 
+//===----------------------------------------------------------------------===//
+// MapCopyToThreadsOp
+//===----------------------------------------------------------------------===//
+DiagnosedSilenceableFailure transform::MapCopyToThreadsOp::applyToOne(
+    transform::TransformRewriter &rewriter, linalg::CopyOp copyOp,
+    transform::ApplyToEachResultList &results,
+    transform::TransformState &state) {
+  auto transformOp = cast<TransformOpInterface>(getOperation());
+  ShapedType resultShapedType;
+  if (copyOp) {
+    resultShapedType =
+        cast<ShapedType>(copyOp.getDpsInitOperand(0)->get().getType());
+  }
+  if (!copyOp || !resultShapedType.hasStaticShape()) {
+    DiagnosedSilenceableFailure diag =
+        transformOp.emitSilenceableError()
+        << "only statically sized linalg.copy ops of rank <= 3 are supported";
+    diag.attachNote(copyOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  // Conservatively set the minimum viable desired bitwidth alignment.
+  int64_t desiredBitAlignment = getDesiredBitAlignment();
+  int64_t eltBitwidth =
+      resultShapedType.getElementType().getIntOrFloatBitWidth();
+  if (desiredBitAlignment % eltBitwidth != 0) {
+    desiredBitAlignment = eltBitwidth;
+  }
+
+  gpu::CopyMappingInfo mapping(
+      /*ctx=*/getContext(),
+      /*totalNumThreads=*/getTotalNumThreads(),
+      /*alignment=*/desiredBitAlignment,
+      /*sizes=*/resultShapedType.getShape(),
+      /*favorPredication=*/false,
+      /*elementalBitwidth=*/
+      resultShapedType.getElementType().getIntOrFloatBitWidth());
+  if (mapping.status == gpu::CopyMappingInfo::Status::Invalid) {
+    DiagnosedSilenceableFailure diag =
+        transformOp.emitSilenceableError()
+        << "too few threads to map copy op to threads on the most minor "
+           "dimension, given alignment and vector size constraints, try a "
+           "smaller tile size or mapping to more threads";
+    diag.attachNote(copyOp->getLoc()) << "target op";
+    return diag;
+  }
+
+  // OpBuilder only used to compute attributes.
+  OpBuilder b(getContext());
+  linalg::ForallTilingResult tilingResult;
+  DiagnosedSilenceableFailure diag = tileToForallOpImpl(
+      /*rewriter=*/rewriter,
+      /*state=*/state,
+      /*transformOp=*/transformOp,
+      /*target=*/copyOp,
+      /*mixedNumThreads=*/getMixedValues(mapping.numThreads, {}, b),
+      /*mixedTileSizes=*/ArrayRef<OpFoldResult>{},
+      /*mapping=*/b.getArrayAttr(mapping.threadMapping),
+      /*tilingResult=*/tilingResult);
+  if (!diag.succeeded())
+    return diag;
+
+  results.push_back(tilingResult.tiledOp);
+  return DiagnosedSilenceableFailure::success();
+}
+
 #include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOpsEnums.cpp.inc"
 
 #define GET_OP_CLASSES
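
To connect the Invalid-status failure path above with the expected-error cases
in the new test file below, a minimal arithmetic sketch (the constants are
taken from the 1024xf16 test; the computation mirrors the heuristic above):

#include <cassert>
#include <cstdint>
#include <numeric>

int main() {
  // 1024 f16 elements, 128-bit desired alignment, 32 threads available.
  constexpr int64_t kMaxVectorLoadBitWidth = 128;
  int64_t vectorSize =
      std::gcd(std::gcd(int64_t(128 / 16), int64_t(1024)),
               kMaxVectorLoadBitWidth / 16); // 8 elements per transfer.
  assert(vectorSize == 8);
  // Even at this maximal vector size, the most minor dimension still needs
  // 1024 / 8 = 128 threads, which exceeds the 32 available threads, so the
  // mapping is Invalid and the transform emits the "too few threads ..."
  // silenceable error exercised below.
  assert(1024 / vectorSize > 32);
  return 0;
}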

diff  --git a/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir b/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir
new file mode 100644
index 00000000000000..d822e2fac8051d
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-gpu-map-copy-to-threads.mlir
@@ -0,0 +1,407 @@
+// RUN: mlir-opt -test-transform-dialect-interpreter -split-input-file -verify-diagnostics -allow-unregistered-dialect %s | FileCheck %s
+
+
+!tt = tensor<8xf16>
+
+// CHECK-LABEL: func @copy_1d_8xf16
+func.func @copy_1d_8xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Too little data for all threads, needs predication, while keeping most
+  /// minor transfer size -> 1 thread.
+  // CHECK: scf.forall {{.*}} in (1) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
+  // CHECK: {mapping = [#gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16xf16>
+
+// CHECK-LABEL: func @copy_1d_16xf16
+func.func @copy_1d_16xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Too little data for all threads, needs predication, while keeping most
+  /// minor transfer size -> 2 threads.
+  // CHECK: scf.forall {{.*}} in (2) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
+  // CHECK: {mapping = [#gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<20xf16>
+
+// CHECK-LABEL: func @copy_1d_20xf16
+func.func @copy_1d_20xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Too little data for all threads, needs predication, while keeping most
+  /// minor transfer size -> 5 threads.
+  // CHECK: scf.forall {{.*}} in (5) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<4xf16>
+  // CHECK: {mapping = [#gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<128xf16>
+
+// CHECK-LABEL: func @copy_1d_128xf16
+func.func @copy_1d_128xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Enough data for all threads and no need for predication but we must reduce
+  /// the transfer size to 4xf16.
+  // CHECK: scf.forall {{.*}} in (32) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<4xf16>
+  // CHECK: {mapping = [#gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<256xf16>
+
+// CHECK-LABEL: func @copy_1d_256xf16
+func.func @copy_1d_256xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Enough data for all threads and no need for predication.
+  // CHECK: scf.forall {{.*}} in (32) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<8xf16>
+  // CHECK: {mapping = [#gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16x32x64xi8>
+
+// CHECK-LABEL: func @copy_3d_16x32x64xi8
+func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
+  // CHECK: scf.forall {{.*}} in (1, 8, 4) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<16x4x16xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16x32x64xi8>
+
+// CHECK-LABEL: func @copy_3d_16x32x64xi8
+func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
+  // CHECK: scf.forall {{.*}} in (1, 4, 8) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<16x8x8xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 64
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<4x8x16xi8>
+
+// CHECK-LABEL: func @copy_3d_4x8x16xi8
+func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
+  // CHECK: scf.forall {{.*}} in (4, 8, 1) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<1x1x16xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<4x8x16xi8>
+
+// CHECK-LABEL: func @copy_3d_4x8x16xi8
+func.func @copy_3d_4x8x16xi8(%t0: !tt, %out: !tt) -> !tt {
+  // CHECK: scf.forall {{.*}} in (1, 2, 16) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<4x4x1xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 8
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<3x5x7xi8>
+
+// CHECK-LABEL: func @copy_3d_3x5x7xi8
+func.func @copy_3d_3x5x7xi8(%t0: !tt, %out: !tt) -> !tt {
+  // Best effort greedy mapping: first 7, then skip 5 (as 7*5 overflows 32), then
+  // take 3.
+  // DP mapping: 7 mandated most minor, then skip 5  (as 7*5 overflows 32), then
+  // take 3.
+  // CHECK: scf.forall {{.*}} in (3, 1, 7) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<1x5x1xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 8
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16x15x5xi8>
+
+// CHECK-LABEL: func @copy_3d_16x15x5xi8
+func.func @copy_3d_16x15x5xi8(%t0: !tt, %out: !tt) -> !tt {
+  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
+  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<2x5x1xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 128 desired_bit_alignment = 8
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16x15x40xi8>
+
+// CHECK-LABEL: func @copy_3d_16x15x40xi8
+func.func @copy_3d_16x15x40xi8(%t0: !tt, %out: !tt) -> !tt {
+  // DP mapping: 5 mandated most minor, then 3 to allow 8 on the outermost.
+  // CHECK: scf.forall {{.*}} in (8, 3, 5) {{.*}}
+  // CHECK:   linalg.copy {{.*}} -> tensor<2x5x8xi8>
+  // CHECK: {mapping = [#gpu.linear<z>, #gpu.linear<y>, #gpu.linear<x>]}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 128 desired_bit_alignment = 64
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+
+////////////////////////////////////////////////////////////////////////////////
+// Tests below are expected to fail.
+////////////////////////////////////////////////////////////////////////////////
+
+// -----
+
+!tt = tensor<1024xf16>
+
+// NO-CHECK-LABEL-ON-EXPECTED-ERROR
+func.func @copy_1d_1024xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Too much data for all threads; we do not try to recover here. It is the
+  /// job of higher-level transformations to select better tile sizes and
+  /// numbers of threads.
+
+  // expected-note @below {{target op}}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<257xf16>
+
+// NO-CHECK-LABEL-ON-EXPECTED-ERROR
+func.func @copy_1d_257xf16(%t0: !tt, %out: !tt) -> !tt {
+  /// Too much data for all threads; we do not try to recover here. It is the
+  /// job of higher-level transformations to select better tile sizes and
+  /// numbers of threads.
+  
+  // expected-note @below {{target op}}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 128
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<512xi8>
+
+// NO-CHECK-LABEL-ON-EXPECTED-ERROR
+func.func @copy_1d_512xi8(%t0: !tt, %out: !tt) -> !tt {
+  /// Too much data for all threads given the forced alignment to 8b;
+  /// we do not try to recover here. It is the job of higher-level
+  /// transformations to select better tile sizes and numbers of threads.
+  // expected-note @below {{target op}}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 8
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}
+
+// -----
+
+!tt = tensor<16x32x64xi8>
+
+// NO-CHECK-LABEL-ON-EXPECTED-ERROR
+func.func @copy_3d_16x32x64xi8(%t0: !tt, %out: !tt) -> !tt {
+  /// Too much data for all threads given the forced alignment to 8b;
+  /// we do not try to recover here. It is the job of higher-level
+  /// transformations to select better tile sizes and numbers of threads.
+  // expected-note @below {{target op}}
+  %0 = linalg.copy ins(%t0: !tt) outs(%out: !tt) -> !tt 
+  return %0 : !tt
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg1: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.copy"]} in %arg1 
+    : (!transform.any_op) -> !transform.any_op
+  // expected-error @below {{too few threads to map copy op to threads on the most minor dimension, given alignment and vector size constraints}}
+  transform.structured.gpu.map_copy_to_threads %0 
+    total_num_threads = 32 desired_bit_alignment = 8
+      : (!transform.any_op) -> (!transform.op<"linalg.copy">)
+}


        

