[Mlir-commits] [llvm] [mlir] [AMDGPU][MLIR]Add shmem-optimization as an op using transform dialect (PR #81550)
llvmlistbot at llvm.org
llvmlistbot at llvm.org
Mon Feb 12 15:56:32 PST 2024
https://github.com/erman-gurses updated https://github.com/llvm/llvm-project/pull/81550
>From 86559827eaec142344ed9b5e95c40ff5793b523d Mon Sep 17 00:00:00 2001
From: erman-gurses <erman at nod-labs.com>
Date: Mon, 12 Feb 2024 17:35:32 -0600
Subject: [PATCH 1/2] Add shmem-opimization as an op using transform dialect
---
.../mlir/Dialect/AMDGPU/CMakeLists.txt | 1 +
.../AMDGPU/TransformOps/AMDGPUTransformOps.h | 49 +++++++++++
.../AMDGPU/TransformOps/AMDGPUTransformOps.td | 46 ++++++++++
.../AMDGPU/TransformOps/CMakeLists.txt | 4 +
.../Dialect/AMDGPU/Transforms/Transforms.h | 3 +
mlir/include/mlir/InitAllExtensions.h | 2 +
mlir/lib/Dialect/AMDGPU/CMakeLists.txt | 3 +-
.../TransformOps/AMDGPUTransformOps.cpp | 84 +++++++++++++++++++
.../AMDGPU/TransformOps/CMakeLists.txt | 25 ++++++
.../Transforms/OptimizeSharedMemory.cpp | 15 ++++
...transform_optimize_shmem_reads_writes.mlir | 64 ++++++++++++++
.../llvm-project-overlay/mlir/BUILD.bazel | 66 +++++++++++++++
12 files changed, 361 insertions(+), 1 deletion(-)
create mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
create mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
create mode 100644 mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
create mode 100644 mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
create mode 100644 mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
create mode 100644 mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
index 9f57627c321fb0..660deb21479d29 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/AMDGPU/CMakeLists.txt
@@ -1,2 +1,3 @@
add_subdirectory(IR)
+add_subdirectory(TransformOps)
add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
new file mode 100644
index 00000000000000..c7721f2f4e0ceb
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h
@@ -0,0 +1,49 @@
+//===- AMDGPUTransformOps.h - AMDGPU transform ops ----------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
+#define MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Transform/IR/TransformAttrs.h"
+#include "mlir/Dialect/Transform/IR/TransformDialect.h"
+#include "mlir/Dialect/Transform/IR/TransformInterfaces.h"
+#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/RegionKindInterface.h"
+
+namespace mlir {
+namespace transform {
+class TransformHandleTypeInterface;
+} // namespace transform
+} // namespace mlir
+
+namespace mlir {
+class DialectRegistry;
+
+namespace linalg {
+class LinalgOp;
+} // namespace linalg
+
+namespace scf {
+class ForOp;
+} // namespace scf
+
+namespace amdgpu {
+void registerTransformDialectExtension(DialectRegistry ®istry);
+} // namespace amdgpu
+} // namespace mlir
+
+//===----------------------------------------------------------------------===//
+// AMDGPU Transform Operations
+//===----------------------------------------------------------------------===//
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h.inc"
+
+#endif // MLIR_DIALECT_AMDGPU_TRANSFORMOPS_AMDGPUTRANSFORMOPS_H
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
new file mode 100644
index 00000000000000..f028aa14097ebb
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td
@@ -0,0 +1,46 @@
+//===- AMDGPUTransformOps.td - AMDGPU transform ops ----------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_TRANSFORM_OPS
+#define AMDGPU_TRANSFORM_OPS
+
+include "mlir/Dialect/Transform/IR/TransformAttrs.td"
+include "mlir/Dialect/Transform/IR/TransformDialect.td"
+include "mlir/Dialect/Transform/IR/TransformInterfaces.td"
+include "mlir/Dialect/Transform/IR/TransformTypes.td"
+include "mlir/Interfaces/SideEffectInterfaces.td"
+
+//===----------------------------------------------------------------------===//
+// ApplyOptimizeSharedMemoryReadsAndWritesOp
+//===----------------------------------------------------------------------===//
+
+def ApplyOptimizeSharedMemoryReadsAndWritesOp :
+ Op<Transform_Dialect, "amdgpu.optimize_shared_memory_reads_and_writes",
+ [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+ TransformOpInterface, TransformEachOpTrait]> {
+ let summary = "Reduce shared memory bank conflicts";
+ let description = [{ This opp adds a transformation and pass to the AMDGPU
+ dialect that attempts to optimize reads/writes from a memref representing
+ GPU shared memory in order to avoid bank conflicts.
+ }];
+
+ let arguments = (ins TransformHandleTypeInterface:$target);
+ let results = (outs);
+
+ let assemblyFormat = "$target attr-dict `:` functional-type(operands, results)";
+
+ let extraClassDeclaration = [{
+ ::mlir::DiagnosedSilenceableFailure applyToOne(
+ ::mlir::transform::TransformRewriter &rewriter,
+ ::mlir::func::FuncOp funcOp,
+ ::mlir::transform::ApplyToEachResultList &results,
+ ::mlir::transform::TransformState &state);
+ }];
+}
+
+#endif // AMDGPU_TRANSFORM_OPS
diff --git a/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
new file mode 100644
index 00000000000000..07bfebc9f96d2e
--- /dev/null
+++ b/mlir/include/mlir/Dialect/AMDGPU/TransformOps/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(LLVM_TARGET_DEFINITIONS AMDGPUTransformOps.td)
+mlir_tablegen(AMDGPUTransformOps.h.inc -gen-op-decls)
+mlir_tablegen(AMDGPUTransformOps.cpp.inc -gen-op-defs)
+add_public_tablegen_target(MLIRAMDGPUTransformOpsIncGen)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
index 140bc12deed690..b4e9ad27003db1 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h
@@ -14,6 +14,7 @@
#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_
+#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"
@@ -48,6 +49,8 @@ namespace amdgpu {
mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
Value memrefValue);
+void optimizeSharedMemoryReadsAndWritesOp(mlir::func::FuncOp funcOp);
+
} // namespace amdgpu
} // namespace mlir
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index 7708ca5571de3b..23d88e00ded1fb 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -26,6 +26,7 @@
#include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
#include "mlir/Dialect/Func/TransformOps/FuncTransformOps.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h"
@@ -66,6 +67,7 @@ inline void registerAllExtensions(DialectRegistry ®istry) {
ub::registerConvertUBToLLVMInterface(registry);
// Register all transform dialect extensions.
+ amdgpu::registerTransformDialectExtension(registry);
affine::registerTransformDialectExtension(registry);
bufferization::registerTransformDialectExtension(registry);
func::registerTransformDialectExtension(registry);
diff --git a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
index 31167e6af908b9..63b4d8b99f53fd 100644
--- a/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(IR)
-add_subdirectory(Transforms)
add_subdirectory(Utils)
+add_subdirectory(TransformOps)
+add_subdirectory(Transforms)
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
new file mode 100644
index 00000000000000..cc2be492286f8c
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
@@ -0,0 +1,84 @@
+//===- AMDGPUTransformOps.cpp - Implementation of AMDGPU transform ops ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
+
+#include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/Utils/IndexingUtils.h"
+#include "mlir/Dialect/Utils/StaticValueUtils.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "llvm/ADT/ArrayRef.h"
+
+using namespace mlir;
+using namespace mlir::linalg;
+using namespace mlir::amdgpu;
+using namespace mlir::transform;
+
+#define DEBUG_TYPE "amdgpu-transforms"
+#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
+#define DBGSNL() (llvm::dbgs() << "\n")
+#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
+
+DiagnosedSilenceableFailure transform::ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
+ transform::TransformRewriter &rewriter, ::mlir::func::FuncOp funcOp,
+ ::mlir::transform::ApplyToEachResultList &results,
+ ::mlir::transform::TransformState &state) {
+ mlir::amdgpu::optimizeSharedMemoryReadsAndWritesOp(funcOp);
+ return DiagnosedSilenceableFailure::success();
+}
+
+void transform::ApplyOptimizeSharedMemoryReadsAndWritesOp::getEffects(
+ SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
+ transform::onlyReadsHandle(getTarget(), effects);
+ transform::modifiesPayload(effects);
+}
+
+//===----------------------------------------------------------------------===//
+// Transform op registration
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AMDGPUTransformDialectExtension
+ : public transform::TransformDialectExtension<
+ AMDGPUTransformDialectExtension> {
+public:
+ AMDGPUTransformDialectExtension() {
+ declareGeneratedDialect<arith::ArithDialect>();
+ declareGeneratedDialect<affine::AffineDialect>();
+ declareGeneratedDialect<amdgpu::AMDGPUDialect>();
+ declareGeneratedDialect<vector::VectorDialect>();
+ registerTransformOps<
+#define GET_OP_LIST
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
+ >();
+ }
+};
+} // namespace
+
+#define GET_OP_CLASSES
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
+
+void mlir::amdgpu::registerTransformDialectExtension(DialectRegistry ®istry) {
+ registry.addExtensions<AMDGPUTransformDialectExtension>();
+}
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
new file mode 100644
index 00000000000000..c39a3b55eabca4
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/CMakeLists.txt
@@ -0,0 +1,25 @@
+add_mlir_dialect_library(MLIRAMDGPUTransformOps
+ AMDGPUTransformOps.cpp
+
+ ADDITIONAL_HEADER_DIRS
+ ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/TransformOps
+
+ DEPENDS
+ MLIRAMDGPUTransformOpsIncGen
+
+ LINK_LIBS PUBLIC
+ MLIRAffineDialect
+ MLIRArithDialect
+ MLIRIR
+ MLIRLinalgDialect
+ MLIRAMDGPUDialect
+ MLIRAMDGPUTransforms
+ MLIRParser
+ MLIRSideEffectInterfaces
+ MLIRSCFDialect
+ MLIRSCFTransforms
+ MLIRTransformDialect
+ MLIRTransformDialectUtils
+ MLIRVectorTransforms
+
+ )
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
index c7001fc6d57d5f..4fb8242a0afcfd 100644
--- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
@@ -220,6 +220,21 @@ mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
return success();
}
+void mlir::amdgpu::optimizeSharedMemoryReadsAndWritesOp(
+ ::mlir::func::FuncOp funcOp) {
+ SmallVector<memref::AllocOp> shmAllocOps;
+ funcOp.walk([&](memref::AllocOp allocOp) {
+ if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(allocOp.getType()))
+ return;
+ shmAllocOps.push_back(allocOp);
+ });
+ for (auto allocOp : shmAllocOps) {
+ if (failed(amdgpu::optimizeSharedMemoryReadsAndWrites(funcOp,
+ allocOp.getMemref())))
+ return;
+ }
+}
+
struct OptimizeSharedMemoryPass
: public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
public:
diff --git a/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
new file mode 100644
index 00000000000000..dfdd1b17e244e3
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/transform_optimize_shmem_reads_writes.mlir
@@ -0,0 +1,64 @@
+// RUN: mlir-opt %s -transform-interpreter | FileCheck %s
+
+ // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index)
+ func.func @optimize_shmem(%arg0: memref<4096x4096xf16>,
+ %readRow: index, %readCol: index,
+ %writeRow: index, %writeCol: index,
+ %fragRow: index, %fragCol: index,
+ %fragColPerm: index,
+ %stRow: index, %stCol: index) {
+ // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16
+ %cst = arith.constant 0.000000e+00 : f16
+
+ // CHECK: [[shmA:%.+]] = memref.alloc
+ // CHECK: [[shmB:%.+]] = memref.alloc
+ %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3>
+ %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3>
+
+ // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
+ %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
+ // CHECK: [[c7:%.+]] = arith.constant 7 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
+ // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
+ vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3>
+ gpu.barrier
+ gpu.barrier
+ // CHECK: [[c7:%.+]] = arith.constant 7 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
+ // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16>
+ %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16>
+
+ // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
+ %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16>
+ // CHECK: [[c7:%.+]] = arith.constant 7 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]]
+ // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
+ vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3>
+ gpu.barrier
+ gpu.barrier
+ // CHECK: [[c7:%.+]] = arith.constant 7 : index
+ // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]]
+ // CHECK: [[c2:%.+]] = arith.constant 2 : index
+ // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]]
+ // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]]
+ // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16>
+ %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16>
+ return
+ }
+
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+ %0 = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+ transform.amdgpu.optimize_shared_memory_reads_and_writes %0 : (!transform.any_op) -> ()
+ transform.yield
+ } // @__transform_main
+} // module
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 821481ee272a56..2c534c7614b10d 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -1511,6 +1511,70 @@ cc_library(
],
)
+cc_library(
+ name = "AMDGPUTransformOps",
+ srcs = glob([
+ "lib/Dialect/AMDGPU/TransformOps/*.cpp",
+ ]),
+ hdrs = glob([
+ "include/mlir/Dialect/AMDGPU/TransformOps/*.h",
+ ]),
+ includes = ["include"],
+ deps = [
+ ":AMDGPUDialect",
+ ":AMDGPUTransformOpsIncGen",
+ ":AMDGPUTransforms",
+ ":AffineDialect",
+ ":Analysis",
+ ":ArithDialect",
+ ":ArithUtils",
+ ":DialectUtils",
+ ":GPUCommonTransforms",
+ ":GPUCompilationAttrInterfacesIncGen",
+ ":GPUDialect",
+ ":IR",
+ ":LLVMCommonConversion",
+ ":LinalgDialect",
+ ":MemRefDialect",
+ ":SCFDialect",
+ ":SCFTransforms",
+ ":Support",
+ ":TransformDialect",
+ ":VectorDialect",
+ "//llvm:Support",
+ ],
+)
+
+td_library(
+ name = "AMDGPUTransformOpsTdFiles",
+ srcs = glob([
+ "include/mlir/Dialect/AMDGPU/TransformOps/*.td",
+ ]),
+ includes = ["include"],
+ deps = [
+ ":TransformDialectTdFiles",
+ ],
+)
+
+gentbl_cc_library(
+ name = "AMDGPUTransformOpsIncGen",
+ tbl_outs = [
+ (
+ ["-gen-op-decls"],
+ "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h.inc",
+ ),
+ (
+ ["-gen-op-defs"],
+ "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc",
+ ),
+ ],
+ tblgen = ":mlir-tblgen",
+ td_file = "include/mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.td",
+ deps = [
+ ":AMDGPUTransformOpsTdFiles",
+ ],
+)
+
gentbl_cc_library(
name = "AMDGPUPassIncGen",
tbl_outs = [
@@ -4614,6 +4678,7 @@ cc_library(
name = "AllExtensions",
hdrs = ["include/mlir/InitAllExtensions.h"],
deps = [
+ ":AMDGPUTransformOps",
":AffineTransformOps",
":ArithToLLVM",
":BufferizationTransformOps",
@@ -8961,6 +9026,7 @@ cc_library(
deps = [
":AMDGPUDialect",
":AMDGPUToROCDL",
+ ":AMDGPUTransformOps",
":AMDGPUTransforms",
":AMXDialect",
":AMXTransforms",
>From a63409d35ce455bb6d8fc1f547c24874c03a3124 Mon Sep 17 00:00:00 2001
From: erman-gurses <erman at nod-labs.com>
Date: Mon, 12 Feb 2024 17:48:40 -0600
Subject: [PATCH 2/2] Fix the format
---
mlir/include/mlir/InitAllExtensions.h | 2 +-
.../AMDGPU/TransformOps/AMDGPUTransformOps.cpp | 17 ++++++++++-------
2 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/mlir/include/mlir/InitAllExtensions.h b/mlir/include/mlir/InitAllExtensions.h
index 23d88e00ded1fb..b31fb26f00f8f4 100644
--- a/mlir/include/mlir/InitAllExtensions.h
+++ b/mlir/include/mlir/InitAllExtensions.h
@@ -23,10 +23,10 @@
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/NVVMToLLVM/NVVMToLLVM.h"
#include "mlir/Conversion/UBToLLVM/UBToLLVM.h"
+#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
#include "mlir/Dialect/Affine/TransformOps/AffineTransformOps.h"
#include "mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.h"
#include "mlir/Dialect/Func/Extensions/AllExtensions.h"
-#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.h"
#include "mlir/Dialect/Func/TransformOps/FuncTransformOps.h"
#include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
#include "mlir/Dialect/Linalg/TransformOps/DialectExtension.h"
diff --git a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
index cc2be492286f8c..fd82e2497a5799 100644
--- a/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
+++ b/mlir/lib/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp
@@ -1,4 +1,5 @@
-//===- AMDGPUTransformOps.cpp - Implementation of AMDGPU transform ops ------===//
+//===- AMDGPUTransformOps.cpp - Implementation of AMDGPU transform ops
+//------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,15 +12,15 @@
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
-#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
-#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -39,8 +40,9 @@ using namespace mlir::transform;
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define DBGSNL() (llvm::dbgs() << "\n")
#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n")
-
-DiagnosedSilenceableFailure transform::ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
+
+DiagnosedSilenceableFailure
+transform::ApplyOptimizeSharedMemoryReadsAndWritesOp::applyToOne(
transform::TransformRewriter &rewriter, ::mlir::func::FuncOp funcOp,
::mlir::transform::ApplyToEachResultList &results,
::mlir::transform::TransformState &state) {
@@ -79,6 +81,7 @@ class AMDGPUTransformDialectExtension
#define GET_OP_CLASSES
#include "mlir/Dialect/AMDGPU/TransformOps/AMDGPUTransformOps.cpp.inc"
-void mlir::amdgpu::registerTransformDialectExtension(DialectRegistry ®istry) {
+void mlir::amdgpu::registerTransformDialectExtension(
+ DialectRegistry ®istry) {
registry.addExtensions<AMDGPUTransformDialectExtension>();
}
More information about the Mlir-commits
mailing list