[llvm] [mlir] [mlir][amdgpu] Remove shared memory optimization pass (PR #88225)

Tue Apr 9 20:42:10 PDT 2024

https://github.com/kuhar created https://github.com/llvm/llvm-project/pull/88225

This implementation has a number of issues and ultimately does not work on gfx9.
* It does not reduce bank conflicts with wide memory accesses.
* It does not correctly account for when LDS bank conflicts occur on amdgpu.
* The implementation is too fragile to be used on real-world code. For example, the code bails out on any `memref.subview` in the root op, even when the subview is not a user of any of the `memref.alloc` ops.

I do not see how these can be easily fixed; therefore, I think it's better to delete this code.

>From cb942636bd05a57896bdd3816a44817882337a98 Mon Sep 17 00:00:00 2001
From: Jakub Kuderski <jakub at nod-labs.com>
Date: Tue, 9 Apr 2024 23:27:00 -0400
Subject: [PATCH] [mlir][amdgpu] Remove the shared memory optimization pass

This reverts commit 87c0260f45e5a02cb07722d089dae3f0f84c7b3d.

This implementation has a number of issues and ultimately does not work
on gfx9.
* It does not reduce bank conflicts with wide memory accesses.
* It does not correctly account for when LDS bank conflicts occur on
  amdgpu.
* The implementation is too fragile to be used on real-world code. For
  example, the code bails out on any `memref.subview` in the root op,
  even when the subview is not a user of any of the `memref.alloc` ops.

I do not see how these can be easily fixed; therefore, I think it's
better to delete this code.
---
 .../mlir/Dialect/AMDGPU/CMakeLists.txt        |   1 -
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td |  17 --
 .../AMDGPU/TransformOps/AMDGPUTransformOps.h  |  48 ----
 .../AMDGPU/TransformOps/AMDGPUTransformOps.td |  47 ----
 .../AMDGPU/TransformOps/CMakeLists.txt        |   4 -
 .../mlir/Dialect/AMDGPU/Transforms/Passes.h   |   2 +-
 .../mlir/Dialect/AMDGPU/Transforms/Passes.td  |  20 --
 .../Dialect/AMDGPU/Transforms/Transforms.h    |  61 ----
 .../mlir/Dialect/AMDGPU/Transforms/Utils.h    |  24 --
 mlir/include/mlir/InitAllExtensions.h         |   2 -
 mlir/lib/Dialect/AMDGPU/CMakeLists.txt        |   1 -
 mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp  |  15 -
 .../TransformOps/AMDGPUTransformOps.cpp       |  67 -----
 .../AMDGPU/TransformOps/CMakeLists.txt        |  25 --
 .../Dialect/AMDGPU/Transforms/CMakeLists.txt  |   3 -
 .../Transforms/OptimizeSharedMemory.cpp       | 261 ------------------
 mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp  |  39 ---
 .../AMDGPU/optimize_shmem_reads_writes.mlir   |  50 ----
 ...transform_optimize_shmem_reads_writes.mlir |  54 ----
 .../llvm-project-overlay/mlir/BUILD.bazel     |  54 ----
 20 files changed, 1 insertion(+), 794 deletions(-)
 delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
 delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
 delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
 delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
 delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h
 delete mode 100644 mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
 delete mode 100644 mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
 delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
 delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp
 delete mode 100644 mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
 delete mode 100644 mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
index 660deb21479d29..9f57627c321fb0 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
@@ -1,3 +1,2 @@
 add_subdirectory(IR)
-add_subdirectory(TransformOps)
 add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 21942b179a0013..3f27e1541cf38c 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -29,23 +29,6 @@ def AMDGPU_Dialect : Dialect {
     "gpu::GPUDialect"
   ];
   let useDefaultAttributePrinterParser = 1;
-
-  let extraClassDeclaration = [{
-    /// Return true if the given MemRefType has an integer address
-    /// space that matches the ROCDL shared memory address space or
-    /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`.
-    static bool hasSharedMemoryAddressSpace(MemRefType type);
-
-    /// Return true if the given Attribute has an integer address
-    /// space that matches the ROCDL shared memory address space or
-    /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`.
-    static bool isSharedMemoryAddressSpace(Attribute type);
-
-    /// Defines the MemRef memory space attribute numeric value that indicates
-    /// a memref is located in shared memory. This should correspond to the
-    /// value used in ROCDL.
-    static constexpr unsigned kSharedMemoryAddressSpace = 3;
-  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
deleted file mode 100644
index dcf934c71dd1fc..00000000000000
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===- AMDGPUTransformOps.h - AMDGPU transform ops ---------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
-#define MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
-#include "mlir/Dialect/Transform/IR/TransformDialect.h"
-#include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h"
-#include "mlir/IR/OpImplementation.h"
-#include "mlir/IR/RegionKindInterface.h"
-
-namespace mlir {
-namespace transform {
-class TransformHandleTypeInterface;
-} // namespace transform
-} // namespace mlir
-
-namespace mlir {
-class DialectRegistry;
-
-namespace linalg {
-class LinalgOp;
-} // namespace linalg
-
-namespace scf {
-class ForOp;
-} // namespace scf
-
-namespace amdgpu {
-void registerTransformDialectExtension(DialectRegistry &registry);
-} // namespace amdgpu
-} // namespace mlir
-
-//===----------------------------------------------------------------------===//
-// AMDGPU Transform Operations
-//===----------------------------------------------------------------------===//
-
-#define GET_OP_CLASSES
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h.inc"
-
-#endif // MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
deleted file mode 100644
index 8aaa87511a2be6..00000000000000
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
+++ /dev/null
@@ -1,47 +0,0 @@
-//===- AMDGPUTransformOps.td - AMDGPU transform ops --------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef AMDGPU_TRANSFORM_OPS
-#define AMDGPU_TRANSFORM_OPS
-
-include "mlir/Dialect/Transform/IR/TransformAttrs.td"
-include "mlir/Dialect/Transform/IR/TransformDialect.td"
-include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.td"
-include "mlir/Dialect/Transform/IR/TransformTypes.td"
-
-include "mlir/Interfaces/SideEffectInterfaces.td"
-//===----------------------------------------------------------------------===//
-// ApplyOptimizeSharedMemoryReadsAndWritesOp
-//===----------------------------------------------------------------------===//
-
-def ApplyOptimizeSharedMemoryReadsAndWritesOp :
-  Op<Transform_Dialect, "amdgpu.optimize_shared_memory_reads_and_writes",
-    [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
-     TransformOpInterface, TransformEachOpTrait]> {
-  let summary = "Reduce shared memory bank conflicts";
-  let description = [{ This op attempts to optimize GPU Shared memory
-    reads/writes with the goal of avoiding bank conflicts.
-  }];
-
-  let arguments = (ins TransformHandleTypeInterface:$target,
-                    DefaultValuedOptionalAttr<I64Attr, "128">:$sharedMemoryLineSizeBytes,
-                    DefaultValuedOptionalAttr<I64Attr, "128">:$defaultVectorSizeBits);
-  let results = (outs);
-
-  let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
-
-  let extraClassDeclaration = [{
-    ::mlir::DiagnosedSilenceableFailure applyToOne(
-        ::mlir::transform::TransformRewriter &rewriter,
-        ::mlir::func::FuncOp funcOp,
-        ::mlir::transform::ApplyToEachResultList &results,
-        ::mlir::transform::TransformState &state);
-  }];
-}
-
-#endif // AMDGPU_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
deleted file mode 100644
index 07bfebc9f96d2e..00000000000000
--- a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS AMDGPUTransformOps.td)
-mlir_tablegen(AMDGPUTransformOps.h.inc -gen-op-decls)
-mlir_tablegen(AMDGPUTransformOps.cpp.inc -gen-op-defs)
-add_public_tablegen_target(MLIRAMDGPUTransformOpsIncGen) 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
index ab695756d2a789..8dd5ff1a4b198a 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
@@ -20,7 +20,7 @@ namespace mlir {
 class ConversionTarget;
 namespace amdgpu {
 
-#define GEN_PASS_DECL
+#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
 #define GEN_PASS_REGISTRATION
 #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
 
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
index 67f951fd19d172..e6b27aa842dfcd 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
@@ -30,24 +30,4 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> {
                         "Chipset that these operations will run on">];
 }
 
-def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> {
-  let summary = "Optimizes accesses to shared memory memrefs in order to reduce bank conflicts.";
-  let description = [{
-    This pass adds a transformation and pass to the AMDGPU dialect that
-    attempts to optimize reads/writes from a memref representing GPU shared
-    memory in order to avoid bank conflicts.
-  }];
-  let dependentDialects = [
-    "memref::MemRefDialect", "vector::VectorDialect"
-  ];
-  let options = [
-    Option<"sharedMemoryLineSizeBytes", "shared-memory-line-size-bytes", "int64_t",
-           /*default=*/"128",
-           "Shared memory line size in bytes">,
-    Option<"defaultVectorSizeBits", "default-vector-size-bits", "int64_t",
-           /*default=*/"128",
-           "Default vector size in bits">,
-  ];
-}
-
 #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
deleted file mode 100644
index 843cea2c503b9a..00000000000000
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
+++ /dev/null
@@ -1,61 +0,0 @@
-//===- Transforms.h - AMDGPU Dialect transformations -------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares functions that assist transformations for the amdgpu
-// dialect.
-//
-//===----------------------------------------------------------------------===//
-#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
-#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
-
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/IR/Operation.h"
-#include "mlir/Support/LogicalResult.h"
-
-namespace mlir {
-class RewriterBase;
-
-namespace amdgpu {
-
-///
-/// Passes
-///
-
-/// Optimizes vectorized accesses to a shared memory buffer specified by
-/// memrefValue. This transformation assumes the following:
-/// 1) All relevant accesses to `memrefValue` are contained with `parentOp`.
-/// 2) The function will fail precondition checks if any subviews are
-/// taken of `memrefValue`. All reads/writes to `memrefValue` should occur
-/// through `memrefValue` directly.
-///
-/// Shared memory bank conflicts occur when multiple threads attempt to read or
-/// write locations assigned to the same shared memory bank. For `2^N` byte
-/// vectorized accesses, we need to be concerned with conflicts among threads
-/// identified as `(tid) -> tid.floordiv(2^{7-N})`. As such, this transformation
-/// changes any indexed memory access (vector.load, memref.load, etc)
-/// such that the final dimension's index value is permuted such that
-/// `newColIndex = oldColIndex % vectorSize +
-/// perm[rowIndex](oldColIndex/vectorSize, rowIndex)` where `rowIndex` is the
-/// index for the second-to last dimension and `perm[rowIndex]` is a permutation
-/// function that depends on the row Index. The permutation function is chosen
-/// to ensure that sequential distributed+vectorized reads/writes down a single
-/// dimension of the memref have minimal conflicts.
-LogicalResult
-optimizeSharedMemoryReadsAndWrites(Operation *parentOp, Value memrefValue,
-                                   int64_t sharedMemoryLineSizeBytes,
-                                   int64_t defaultVectorSizeBits);
-
-std::optional<LogicalResult>
-optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
-                                     int64_t sharedMemoryLineSizeBytes,
-                                     int64_t defaultVectorSizeBits);
-
-} // namespace amdgpu
-} // namespace mlir
-
-#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h
deleted file mode 100644
index 9e5e9589d62f35..00000000000000
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h
+++ /dev/null
@@ -1,24 +0,0 @@
-//===- Utils.h - Transform utilities -----------------------------*- C++-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/IR/Operation.h"
-
-namespace mlir {
-namespace amdgpu {
-
-/// Get and set the indices that the given load/store operation is operating on.
-/// Preconditions:
-/// - The Op must have memory affects.
-/// - Considers memref::LoadOp, vector::LoadOp, and vector::TransferReadOp.
-/// - Considers memref::StoreOp, vector::StoreOp, and vector::TransferWriteOp.
-/// - Excludes subview op.
-std::optional<Operation::operand_range> getIndices(Operation *op);
-void setIndices(Operation *op, ArrayRef<Value> indices);
-
-} // namespace amdgpu
-} // namespace mlir
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index b31fb26f00f8f4..7708ca5571de3b 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -23,7 +23,6 @@
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
 #include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
 #include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
 #include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
 #include "mlir/Dialect/Func/Extensions/AllExtensions.h"
@@ -67,7 +66,6 @@ inline void registerAllExtensions(DialectRegistry &registry) {
   ub::registerConvertUBToLLVMInterface(registry);
 
   // Register all transform dialect extensions.
-  amdgpu::registerTransformDialectExtension(registry);
   affine::registerTransformDialectExtension(registry);
   bufferization::registerTransformDialectExtension(registry);
   func::registerTransformDialectExtension(registry);
diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
index c47e4c5495c17b..31167e6af908b9 100644
--- a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
@@ -1,4 +1,3 @@
 add_subdirectory(IR)
-add_subdirectory(TransformOps)
 add_subdirectory(Transforms)
 add_subdirectory(Utils)
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 4e72fbf56b80a4..2575ad4984814b 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -43,21 +43,6 @@ void AMDGPUDialect::initialize() {
       >();
 }
 
-bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) {
-  if (!memorySpace)
-    return false;
-  if (auto intAttr = llvm::dyn_cast<IntegerAttr>(memorySpace))
-    return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace;
-  if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
-    return gpuAttr.getValue() == gpu::AddressSpace::Workgroup;
-  return false;
-}
-
-bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) {
-  Attribute memorySpace = type.getMemorySpace();
-  return isSharedMemoryAddressSpace(memorySpace);
-}
-
 //===----------------------------------------------------------------------===//
 // 8-bit float ops
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
deleted file mode 100644
index b7e17a92897389..00000000000000
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===- AMDGPUTransformOps.cpp - Implementation of AMDGPU transform ops-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-
-using namespace mlir;
-using namespace mlir::amdgpu;
-using namespace mlir::transform;
-using namespace mlir::func;
-
-#define DEBUG_TYPE "amdgpu-transforms"
-#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
-#define DBGSNL() (llvm::dbgs() << "\n")
-#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
-
-DiagnosedSilenceableFailure
-ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
-    TransformRewriter &rewriter, FuncOp funcOp, ApplyToEachResultList &results,
-    TransformState &state) {
-  optimizeSharedMemoryReadsAndWritesOp(funcOp, getSharedMemoryLineSizeBytes(),
-                                       getDefaultVectorSizeBits());
-  return DiagnosedSilenceableFailure::success();
-}
-
-void ApplyOptimizeSharedMemoryReadsAndWritesOp::getEffects(
-    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
-  onlyReadsHandle(getTarget(), effects);
-  modifiesPayload(effects);
-}
-
-//===----------------------------------------------------------------------===//
-// Transform op registration
-//===----------------------------------------------------------------------===//
-
-namespace {
-class AMDGPUTransformDialectExtension
-    : public TransformDialectExtension<AMDGPUTransformDialectExtension> {
-public:
-  AMDGPUTransformDialectExtension() {
-    declareGeneratedDialect<arith::ArithDialect>();
-    declareGeneratedDialect<affine::AffineDialect>();
-    declareGeneratedDialect<amdgpu::AMDGPUDialect>();
-    declareGeneratedDialect<vector::VectorDialect>();
-    registerTransformOps<
-#define GET_OP_LIST
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
-        >();
-  }
-};
-} // namespace
-
-#define GET_OP_CLASSES
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
-
-void amdgpu::registerTransformDialectExtension(DialectRegistry &registry) {
-  registry.addExtensions<AMDGPUTransformDialectExtension>();
-}
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
deleted file mode 100644
index c39a3b55eabca4..00000000000000
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
+++ /dev/null
@@ -1,25 +0,0 @@
-add_mlir_dialect_library(MLIRAMDGPUTransformOps
-  AMDGPUTransformOps.cpp
-
-  ADDITIONAL_HEADER_DIRS
-  ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/TransformOps
-
-  DEPENDS
-  MLIRAMDGPUTransformOpsIncGen
-
-  LINK_LIBS PUBLIC
-  MLIRAffineDialect
-  MLIRArithDialect
-  MLIRIR
-  MLIRLinalgDialect
-  MLIRAMDGPUDialect
-  MLIRAMDGPUTransforms
-  MLIRParser
-  MLIRSideEffectInterfaces
-  MLIRSCFDialect
-  MLIRSCFTransforms
-  MLIRTransformDialect
-  MLIRTransformDialectUtils
-  MLIRVectorTransforms
-
-  )
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
index a955d585b9a1dc..0889a21bddc44c 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt
@@ -1,7 +1,5 @@
 add_mlir_dialect_library(MLIRAMDGPUTransforms
   EmulateAtomics.cpp
-  OptimizeSharedMemory.cpp
-  Utils.cpp
 
   ADDITIONAL_HEADER_DIRS
   {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms
@@ -19,5 +17,4 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms
   MLIRPass
   MLIRTransforms
   MLIRTransformUtils
-  MLIRVectorDialect
   )
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
deleted file mode 100644
index 32fab265e03cc0..00000000000000
--- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-//===- OptimizeSharedMemory.cpp - MLIR AMDGPU pass implementation ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements transforms to optimize accesses to shared memory.
-// It is inspired by
-// https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp
-//
-//===----------------------------------------------------------------------===//
-
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
-#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/GPU/IR/GPUDialect.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-#include "mlir/Interfaces/SideEffectInterfaces.h"
-#include "mlir/Support/LogicalResult.h"
-
-namespace mlir {
-namespace amdgpu {
-#define GEN_PASS_DEF_OPTIMIZESHAREDMEMORY
-#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
-} // namespace amdgpu
-} // namespace mlir
-
-using namespace mlir;
-using namespace mlir::amdgpu;
-
-/// Uses `srcIndexValue` to permute `tgtIndexValue` via
-/// `result = xor(floordiv(srcIdxVal,permuteEveryN),
-///               floordiv(tgtIdxVal,vectorSize)))
-///            + tgtIdxVal % vectorSize`
-/// This is done using an optimized sequence of `arith` operations.
-static Value permuteVectorOffset(OpBuilder &b, Location loc,
-                                 ArrayRef<Value> indices, MemRefType memrefTy,
-                                 int64_t srcDim, int64_t tgtDim,
-                                 int64_t sharedMemoryLineSizeBytes,
-                                 int64_t defaultVectorSizeBits) {
-  // Adjust the src index to change how often the permutation changes
-  // if necessary.
-  Value src = indices[srcDim];
-
-  // We only want to permute every N iterations of the target dim where N is
-  // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
-  const int64_t permuteEveryN = std::max<int64_t>(
-      1, sharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
-                                       memrefTy.getElementTypeBitWidth()) /
-                                      8));
-
-  // clang-format off
-  // Index bit representation (b0 = least significant bit) for dim(1)
-  // of a `memref<?x?xDT>` is as follows:
-  // N := log2(128/elementSizeBits)
-  // M := log2(dimSize(1))
-  // then
-  // bits[0:N] = sub-vector element offset
-  // bits[N:M] = vector index
-  // clang-format on
-  int64_t n =
-      llvm::Log2_64(defaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
-  int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
-
-  // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
-  int64_t mask = (1LL << (m - n)) - 1;
-  if (permuteEveryN > 1)
-    mask = mask << llvm::Log2_64(permuteEveryN);
-  Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
-  srcBits = b.create<arith::AndIOp>(loc, src, srcBits);
-
-  /// Use the src bits to permute the target bits b[N:M] containing the
-  /// vector offset.
-  if (permuteEveryN > 1) {
-    int64_t shlBits = n - llvm::Log2_64(permuteEveryN);
-    if (shlBits > 0) {
-      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, shlBits);
-      srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
-    } else if (shlBits < 0) {
-      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, -1 * shlBits);
-      srcBits = b.createOrFold<arith::ShRUIOp>(loc, srcBits, finalShiftVal);
-    }
-  } else {
-    Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, n);
-    srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
-  }
-
-  Value permutedVectorIdx =
-      b.create<arith::XOrIOp>(loc, indices[tgtDim], srcBits);
-  return permutedVectorIdx;
-}
-
-static void transformIndices(OpBuilder &builder, Location loc,
-                             SmallVector<Value, 4> &indices,
-                             MemRefType memrefTy, int64_t srcDim,
-                             int64_t tgtDim, int64_t sharedMemoryLineSizeBytes,
-                             int64_t defaultVectorSizeBits) {
-  indices[tgtDim] =
-      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim,
-                          sharedMemoryLineSizeBytes, defaultVectorSizeBits);
-}
-
-// Return all operations within `parentOp` that read from or write to
-// `shmMemRef`.
-static LogicalResult
-getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
-                      SmallVector<Operation *, 16> &readOps,
-                      SmallVector<Operation *, 16> &writeOps) {
-  parentOp->walk([&](Operation *op) {
-    MemoryEffectOpInterface iface = dyn_cast<MemoryEffectOpInterface>(op);
-    if (!iface)
-      return;
-    std::optional<MemoryEffects::EffectInstance> effect =
-        iface.getEffectOnValue<MemoryEffects::Read>(shmMemRef);
-    if (effect) {
-      readOps.push_back(op);
-      return;
-    }
-    effect = iface.getEffectOnValue<MemoryEffects::Write>(shmMemRef);
-    if (effect)
-      writeOps.push_back(op);
-  });
-
-  // Restrict to a supported set of ops. We also require at least 2D access,
-  // although this could be relaxed.
-  if (llvm::any_of(readOps, [](Operation *op) {
-        return !isa<memref::LoadOp, vector::LoadOp, vector::TransferReadOp>(
-                   op) ||
-               amdgpu::getIndices(op)->size() < 2;
-      }))
-    return failure();
-  if (llvm::any_of(writeOps, [](Operation *op) {
-        return !isa<memref::StoreOp, vector::StoreOp, vector::TransferWriteOp>(
-                   op) ||
-               amdgpu::getIndices(op)->size() < 2;
-      }))
-    return failure();
-
-  return success();
-}
-
-LogicalResult amdgpu::optimizeSharedMemoryReadsAndWrites(
-    Operation *parentOp, Value memrefValue, int64_t sharedMemoryLineSizeBytes,
-    int64_t defaultVectorSizeBits) {
-  auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
-  if (!memRefType ||
-      !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
-    return failure();
-
-  // Abort if the given value has any sub-views; we do not do any alias
-  // analysis.
-  bool hasSubView = false;
-  parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
-  if (hasSubView)
-    return failure();
-
-  // Check if this is necessary given the assumption of 128b accesses:
-  // If dim[rank-1] is small enough to fit 8 rows in a 128B line.
-  const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
-  const int64_t rowsPerLine =
-      (8 * sharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
-      rowSize;
-  const int64_t threadGroupSize =
-      1LL << (7 - llvm::Log2_64(defaultVectorSizeBits / 8));
-  if (rowsPerLine >= threadGroupSize)
-    return failure();
-
-  // Get sets of operations within the function that read/write to shared
-  // memory.
-  SmallVector<Operation *, 16> shmReadOps;
-  SmallVector<Operation *, 16> shmWriteOps;
-  if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
-                                   shmWriteOps)))
-    return failure();
-
-  if (shmReadOps.empty() || shmWriteOps.empty())
-    return failure();
-
-  OpBuilder builder(parentOp->getContext());
-
-  int64_t tgtDim = memRefType.getRank() - 1;
-  int64_t srcDim = memRefType.getRank() - 2;
-
-  // Transform indices for the ops writing to shared memory.
-  while (!shmWriteOps.empty()) {
-    Operation *shmWriteOp = shmWriteOps.pop_back_val();
-    builder.setInsertionPoint(shmWriteOp);
-
-    auto indices = amdgpu::getIndices(shmWriteOp);
-    SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
-    transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
-                     defaultVectorSizeBits);
-    amdgpu::setIndices(shmWriteOp, transformedIndices);
-  }
-
-  // Transform indices for the ops reading from shared memory.
-  while (!shmReadOps.empty()) {
-    Operation *shmReadOp = shmReadOps.pop_back_val();
-    builder.setInsertionPoint(shmReadOp);
-
-    auto indices = amdgpu::getIndices(shmReadOp);
-    SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
-    transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
-                     memRefType, srcDim, tgtDim, sharedMemoryLineSizeBytes,
-                     defaultVectorSizeBits);
-    amdgpu::setIndices(shmReadOp, transformedIndices);
-  }
-
-  return success();
-}
-
-std::optional<LogicalResult>
-amdgpu::optimizeSharedMemoryReadsAndWritesOp(func::FuncOp funcOp,
-                                             int64_t sharedMemoryLineSizeBytes,
-                                             int64_t defaultVectorSizeBits) {
-  SmallVector<memref::AllocOp> shmAllocOps;
-  funcOp.walk([&](memref::AllocOp allocOp) {
-    if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
-      return;
-    shmAllocOps.push_back(allocOp);
-  });
-  for (auto allocOp : shmAllocOps) {
-    if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(
-            funcOp, allocOp.getMemref(), sharedMemoryLineSizeBytes,
-            defaultVectorSizeBits)))
-      return failure();
-  }
-  return success();
-}
-
-struct OptimizeSharedMemoryPass
-    : public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
-public:
-  OptimizeSharedMemoryPass() = default;
-  OptimizeSharedMemoryPass(const OptimizeSharedMemoryOptions &options)
-      : OptimizeSharedMemoryBase(options) {}
-  void runOnOperation() override {
-    Operation *op = getOperation();
-    SmallVector<memref::AllocOp> shmAllocOps;
-    op->walk([&](memref::AllocOp allocOp) {
-      if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(
-              allocOp.getType()))
-        return;
-      shmAllocOps.push_back(allocOp);
-    });
-    for (auto allocOp : shmAllocOps) {
-      if (failed(optimizeSharedMemoryReadsAndWrites(op, allocOp.getMemref(),
-                                                    sharedMemoryLineSizeBytes,
-                                                    defaultVectorSizeBits)))
-        return;
-    }
-  }
-};
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp
deleted file mode 100644
index 8163eeafdf1f0a..00000000000000
--- a/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp
+++ /dev/null
@@ -1,39 +0,0 @@
-#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
-
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/Vector/IR/VectorOps.h"
-
-using namespace mlir;
-using namespace mlir::amdgpu;
-
-std::optional<Operation::operand_range> amdgpu::getIndices(Operation *op) {
-  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
-    return loadOp.getIndices();
-  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
-    return storeOp.getIndices();
-  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
-    return vectorReadOp.getIndices();
-  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
-    return vectorStoreOp.getIndices();
-  if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
-    return transferReadOp.getIndices();
-  if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
-    return transferWriteOp.getIndices();
-  return std::nullopt;
-}
-
-void amdgpu::setIndices(Operation *op, ArrayRef<Value> indices) {
-  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
-    return loadOp.getIndicesMutable().assign(indices);
-  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
-    return storeOp.getIndicesMutable().assign(indices);
-  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
-    return vectorReadOp.getIndicesMutable().assign(indices);
-  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
-    return vectorStoreOp.getIndicesMutable().assign(indices);
-  if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
-    return transferReadOp.getIndicesMutable().assign(indices);
-  if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
-    return transferWriteOp.getIndicesMutable().assign(indices);
-}
diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
deleted file mode 100644
index 983eee732e2afe..00000000000000
--- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s
-  
-  // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
-  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
-                    %readRow: index, %readCol: index,
-                    %writeRow: index, %writeCol: index,
-                    %fragRow: index, %fragCol: index,
-                    %fragColPerm: index,
-                    %stRow: index, %stCol: index) {
-    // CHECK:    %[[cst:.+]] = arith.constant 0.000000e+00 : f16
-    %cst = arith.constant 0.000000e+00 : f16
-
-    // CHECK: [[shmA:%.+]] = memref.alloc
-    // CHECK: [[shmB:%.+]] = memref.alloc
-    %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3>
-    %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
-
-    %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
-    vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
-    gpu.barrier
-    gpu.barrier
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] 
-    %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
-
-    %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
-    vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
-    gpu.barrier
-    gpu.barrier
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
-    %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
-    return
-  }
diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
deleted file mode 100644
index b1bb91ffc29721..00000000000000
--- a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
+++ /dev/null
@@ -1,54 +0,0 @@
-// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
-
-  // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
-  func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
-                    %readRow: index, %readCol: index,
-                    %writeRow: index, %writeCol: index,
-                    %fragRow: index, %fragCol: index,
-                    %fragColPerm: index,
-                    %stRow: index, %stCol: index) {
-    %cst = arith.constant 0.000000e+00 : f16
-
-    %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3>
-    %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
-
-    %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
-    vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
-    gpu.barrier
-    gpu.barrier
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
-    %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
-    %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
-    vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
-    gpu.barrier
-    gpu.barrier
-    // CHECK: [[c6:%.+]] = arith.constant 6 : index
-    // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c6]]
-    // CHECK: [[c2:%.+]] = arith.constant 2 : index
-    // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
-    // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
-    %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
-    return
-  }
-
-module attributes { transform.with_named_sequence } {
-  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
-    %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
-    transform.amdgpu.optimize_shared_memory_reads_and_writes %0 {sharedMemoryLineSizeBytes = 128, defaultVectorSizeBits = 128}: (!transform.any_op) -> ()
-    transform.yield
-  } // @__transform_main
-} // module
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 497edcfceffe4d..67052fcd399309 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1552,58 +1552,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "AMDGPUTransformOps",
-    srcs = glob([
-        "lib/Dialect/AMDGPU/TransformOps/*.cpp",
-    ]),
-    hdrs = glob([
-        "include/mlir/Dialect/AMDGPU/TransformOps/*.h",
-    ]),
-    includes = ["include"],
-    deps = [
-        ":AMDGPUDialect",
-        ":AMDGPUTransformOpsIncGen",
-        ":AMDGPUTransforms",
-        ":AffineDialect",
-        ":FuncDialect",
-        ":IR",
-        ":TransformDialect",
-        ":TransformDialectInterfaces",
-        ":VectorDialect",
-    ],
-)
-
-td_library(
-    name = "AMDGPUTransformOpsTdFiles",
-    srcs = glob([
-        "include/mlir/Dialect/AMDGPU/TransformOps/*.td",
-    ]),
-    includes = ["include"],
-    deps = [
-        ":TransformDialectTdFiles",
-    ],
-)
-
-gentbl_cc_library(
-    name = "AMDGPUTransformOpsIncGen",
-    tbl_outs = [
-        (
-            ["-gen-op-decls"],
-            "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h.inc",
-        ),
-        (
-            ["-gen-op-defs"],
-            "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc",
-        ),
-    ],
-    tblgen = ":mlir-tblgen",
-    td_file = "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td",
-    deps = [
-        ":AMDGPUTransformOpsTdFiles",
-    ],
-)
-
 gentbl_cc_library(
     name = "AMDGPUPassIncGen",
     tbl_outs = [
@@ -4787,7 +4735,6 @@ cc_library(
     name = "AllExtensions",
     hdrs = ["include/mlir/InitAllExtensions.h"],
     deps = [
-        ":AMDGPUTransformOps",
         ":AffineTransformOps",
         ":ArithToLLVM",
         ":BufferizationTransformOps",
@@ -9033,7 +8980,6 @@ cc_library(
     deps = [
         ":AMDGPUDialect",
         ":AMDGPUToROCDL",
-        ":AMDGPUTransformOps",
         ":AMDGPUTransforms",
         ":AMXDialect",
         ":AMXTransforms",