[Mlir-commits] [mlir] Add Lowerings for GPU WMMA F16/F32 ops to ROCDL dialect (PR #69357)

llvmlistbot at llvm.org llvmlistbot at llvm.org
Tue Oct 17 10:11:49 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Navdeep Katel (navdeepkk-polymagelabs)

<details>
<summary>Changes</summary>

The following support is added:
1.) Lowering for GPU WMMA load op for AOp, BOp, COp. The lowering supports transposed and non-transposed loads for AOp and BOp. Only non-transposed loads are supported for COp. Loading for COp also supports the opSelect bit.
2.) Lowering for GPU WMMA mma op with support for opSelect bit.
3.) Lowering for GPU WMMA store op with support for opSelect bit.

---

Patch is 144.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/69357.diff


29 Files Affected:

- (added) mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h (+85) 
- (modified) mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h (+56-7) 
- (modified) mlir/include/mlir/Conversion/Passes.h (+1) 
- (modified) mlir/include/mlir/Conversion/Passes.td (+41-10) 
- (modified) mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt (+4) 
- (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h (+2) 
- (modified) mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td (+13) 
- (modified) mlir/lib/Conversion/CMakeLists.txt (+1) 
- (added) mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt (+18) 
- (added) mlir/lib/Conversion/GPUToAMDGPU/LowerGPUOpsToAMDGPUOps.cpp (+101) 
- (added) mlir/lib/Conversion/GPUToAMDGPU/WmmaOpsToAMDGPU.cpp (+180) 
- (modified) mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt (+1) 
- (modified) mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp (+48-39) 
- (added) mlir/lib/Conversion/GPUToROCDL/WmmaOpsToROCDL.cpp (+512) 
- (modified) mlir/test/CMakeLists.txt (+3) 
- (added) mlir/test/Conversion/GPUToAMDGPU/wmma-ops-to-amdgpu-unsupported-chipset.mlir (+10) 
- (added) mlir/test/Conversion/GPUToAMDGPU/wmma-ops-to-amdgpu-unsupported-operands.mlir (+33) 
- (added) mlir/test/Conversion/GPUToAMDGPU/wmma-ops-to-amdgpu-unsupported-warpsize.mlir (+10) 
- (added) mlir/test/Conversion/GPUToAMDGPU/wmma-ops-to-amdgpu.mlir (+34) 
- (added) mlir/test/Conversion/GPUToROCDL/wmma-ops-to-rocdl-unsupported-chipset.mlir (+30) 
- (added) mlir/test/Conversion/GPUToROCDL/wmma-ops-to-rocdl-unsupported.mlir (+181) 
- (added) mlir/test/Conversion/GPUToROCDL/wmma-ops-to-rocdl.mlir (+442) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/lit.local.cfg (+5) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/wmma_f16_16_16_16_f16.mlir (+95) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/wmma_f16_16_16_16_f16_opselect.mlir (+95) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/wmma_f16_16_16_16_f16_x2.mlir (+100) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/wmma_f32_16_16_16_f16.mlir (+86) 
- (added) mlir/test/Integration/GPU/ROCM/WMMA/wmma_f32_16_16_16_f16_a_b_transpose.mlir (+84) 
- (modified) mlir/test/lit.site.cfg.py.in (+1) 


``````````diff
diff --git a/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h
new file mode 100644
index 000000000000000..b5d0ab97d0ec6ca
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h
@@ -0,0 +1,85 @@
+//===- GPUToAMDGPUPass.h - Convert GPU kernel to AMDGPU dialect -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPUPASS_H_
+#define MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPUPASS_H_
+
+#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include <memory>
+
+namespace llvm {
+class StringRef;
+} // namespace llvm
+
+namespace mlir {
+class ConversionTarget;
+class OpBuilder;
+class Location;
+class RewritePatternSet;
+class Type;
+class TypeConverter;
+
+template <typename OpT>
+class OperationPass;
+
+namespace gpu {
+class GPUModuleOp;
+class MMAMatrixType;
+} // namespace gpu
+
+#define GEN_PASS_DECL_CONVERTGPUOPSTOAMDGPUOPS
+#include "mlir/Conversion/Passes.h.inc"
+
+namespace amd {
+/// Return the LLVM Type corresponding to the MMAMatrixType.
+Type convertWMMAToVectorType(gpu::MMAMatrixType matrixType);
+
+/// String to represent the `opSelect` attribute name.
+constexpr char kAMDGpuOpselectAttrName[] = "opSelect";
+} // namespace amd
+
+/// Collect a set of patterns to convert from the GPU dialect to AMDGPU.
+/// If `runtime` is Unknown, gpu.printf will not be lowered. The resulting
+/// pattern set should be run over a gpu.module op. `chipset` is the chip we are
+/// targeting. `warpSize` is the warp size to use when generating WMMA
+/// intrinsics. `opSelect` is used in the lowering of f16 versions of WMMA ops
+/// involving the `C` operand. If `opSelect` is true, the upper half of the
+/// general-purpose 32-bit registers is used for storing the values; if false,
+/// the lower half is used.
+void populateGpuToAMDGPUConversionPatterns(TypeConverter &typeConverter,
+                                           RewritePatternSet &patterns,
+                                           llvm::StringRef chipset = "gfx1100",
+                                           unsigned warpSize = 32);
+
+/// Creates a pass that lowers GPU dialect operations to AMDGPU counterparts.
+/// The index bitwidth used for the lowering of the device side index
+/// computations is configurable. AMD gpus have a configurable warp size; valid
+/// choices are 32 and 64. We choose 32 as the default size. `opSelect` is used
+/// in the lowering of f16 versions of WMMA ops involving the `C` operand. If
+/// `opSelect` is true, the upper half of the general-purpose 32-bit registers
+/// is used for storing the values; if false, the lower half is used.
+std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
+createLowerGpuOpsToAMDGPUOpsPass(const std::string &chipset = "gfx1100",
+                                 unsigned warpSize = 32);
+
+/// Collect a set of patterns to convert WMMA ops from GPU dialect to AMDGPU.
+/// `chipset` is the target chip for which the IR is being generated.
+/// `warpSize` is the warp size to use when generating WMMA intrinsics.
+void populateGpuWMMAToAMDGPUConversionPatterns(TypeConverter &typeConverter,
+                                               RewritePatternSet &patterns,
+                                               llvm::StringRef chipset,
+                                               unsigned warpSize);
+
+} // namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOAMDGPU_GPUTOAMDGPUPASS_H_
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
index 5647787712997b5..7b0e845cf81a520 100644
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -10,42 +10,91 @@
 
 #include "mlir/Conversion/GPUToROCDL/Runtimes.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include <memory>
 
+namespace llvm {
+class StringRef;
+} // namespace llvm
+
 namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
+class OpBuilder;
+class Location;
 class RewritePatternSet;
+class Type;
 
 template <typename OpT>
 class OperationPass;
 
 namespace gpu {
 class GPUModuleOp;
+class MMAMatrixType;
 } // namespace gpu
 
 #define GEN_PASS_DECL_CONVERTGPUOPSTOROCDLOPS
 #include "mlir/Conversion/Passes.h.inc"
 
+namespace amd {
+/// Constant representing a wavefront size of 32 workitems.
+const unsigned kWaveFrontSize32 = 32;
+
+/// Constant representing a wavefront size of 64 workitems.
+const unsigned kWaveFrontSize64 = 64;
+
+/// Wavefront sizes that are supported by the GPU to ROCDL lowerings.
+const unsigned kWMMASupportedWaveFrontSizes[] = {kWaveFrontSize32,
+                                                 kWaveFrontSize64};
+
+/// Generate ops to get the laneId of the current lane and return it.
+Value getLaneId(PatternRewriter &rewriter, Location loc,
+                unsigned indexBitwidth);
+
+/// Return the LLVM Type corresponding to the MMAMatrixType.
+Type convertWMMAToROCDLLLVMType(gpu::MMAMatrixType matrixType);
+} // namespace amd
+
 /// Collect a set of patterns to convert from the GPU dialect to ROCDL.
-/// If `runtime` is Unknown, gpu.printf will not be lowered
-/// The resulting pattern set should be run over a gpu.module op
-void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
-                                          RewritePatternSet &patterns,
-                                          gpu::amd::Runtime runtime);
+/// If `runtime` is Unknown, gpu.printf will not be lowered. The resulting
+/// pattern set should be run over a gpu.module op. `chipset` is the chip we are
+/// targeting. `indexBitwidth` is the bitwidth to be used while converting index
+/// types. `warpSize` is the warp size to use when generating WMMA intrinsics.
+void populateGpuToROCDLConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    gpu::amd::Runtime runtime, llvm::StringRef chipset = "gfx900",
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    unsigned warpSize = 32);
 
 /// Configure target to convert from the GPU dialect to ROCDL.
 void configureGpuToROCDLConversionLegality(ConversionTarget &target);
 
 /// Creates a pass that lowers GPU dialect operations to ROCDL counterparts. The
 /// index bitwidth used for the lowering of the device side index computations
-/// is configurable.
+/// is configurable. AMD gpus have a configurable warp size; valid choices are
+/// 32 and 64. We choose 32 as the default size.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
     const std::string &chipset = "gfx900",
     unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
     bool useBarePtrCallConv = false,
-    gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
+    gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown,
+    unsigned warpSize = 32);
+
+/// Collect a set of patterns to convert WMMA ops from GPU dialect to ROCDL.
+/// `chipset` is the target chip for which the IR is being generated.
+/// `indexBitwidth` is the bitwidth to be used while converting index types.
+/// `warpSize` is the warp size to use when generating WMMA intrinsics.
+void populateGpuWMMAToROCDLConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    llvm::StringRef chipset = "gfx900",
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    unsigned warpSize = 32);
 
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h
index e714f5070f23db8..9a4f9812253d81b 100644
--- a/mlir/include/mlir/Conversion/Passes.h
+++ b/mlir/include/mlir/Conversion/Passes.h
@@ -30,6 +30,7 @@
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
 #include "mlir/Conversion/FuncToSPIRV/FuncToSPIRVPass.h"
 #include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h"
 #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
 #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
 #include "mlir/Conversion/GPUToSPIRV/GPUToSPIRVPass.h"
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index a269fb4a83af41f..5ea284774a9823b 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -495,6 +495,30 @@ def LowerHostCodeToLLVMPass : Pass<"lower-host-to-llvm", "ModuleOp"> {
   let dependentDialects = ["LLVM::LLVMDialect"];
 }
 
+//===----------------------------------------------------------------------===//
+// GPUToAMDGPU
+//===----------------------------------------------------------------------===//
+
+def ConvertGpuOpsToAMDGPUOps : Pass<"convert-gpu-to-amdgpu", "gpu::GPUModuleOp"> {
+  let summary = "Generate AMD GPU operations for gpu operations";
+  let constructor = "mlir::createLowerGpuOpsToAMDGPUOpsPass()";
+  let dependentDialects = [
+    "amdgpu::AMDGPUDialect",
+  ];
+  let options = [
+    Option<"chipset", "chipset", "std::string",
+           /*default=*/"\"gfx000\"",
+           "Chipset that these operations will run on">,
+    Option<"indexBitwidth", "index-bitwidth", "unsigned",
+           /*default=kDeriveIndexBitwidthFromDataLayout*/ "0",
+           "Bitwidth of the index type, 0 to use size of machine word">,
+    Option<"warpSize", "warp-size", "unsigned",
+           /*default=*/"32",
+           "AMD GPUs have a configurable warp size; valid choices are 32 and "
+           "64. 32 is used as the default size.">,
+  ];
+}
+
 //===----------------------------------------------------------------------===//
 // GPUToNVVM
 //===----------------------------------------------------------------------===//
@@ -539,23 +563,30 @@ def ConvertGpuOpsToROCDLOps : Pass<"convert-gpu-to-rocdl", "gpu::GPUModuleOp"> {
            /*default=*/"\"gfx000\"",
            "Chipset that these operations will run on">,
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
-           /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
+           /*default=kDeriveIndexBitwidthFromDataLayout*/ "0",
            "Bitwidth of the index type, 0 to use size of machine word">,
     Option<"useBarePtrCallConv", "use-bare-ptr-memref-call-conv", "bool",
            /*default=*/"false",
            "Replace memref arguments in GPU functions with bare pointers."
            "All memrefs must have static shape">,
     Option<"runtime", "runtime", "::mlir::gpu::amd::Runtime",
-          "::mlir::gpu::amd::Runtime::Unknown",
-          "Runtime code will be run on (default is Unknown, can also use HIP or OpenCl)",
-          [{::llvm::cl::values(
-            clEnumValN(::mlir::gpu::amd::Runtime::Unknown, "unknown", "Unknown (default)"),
-            clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
-            clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL")
-          )}]>,
+           "::mlir::gpu::amd::Runtime::Unknown",
+           "Runtime code will be run on (default is Unknown, can also use HIP "
+           "or OpenCl)",
+           [{::llvm::cl::values(
+               clEnumValN(::mlir::gpu::amd::Runtime::Unknown, "unknown",
+                          "Unknown (default)"),
+               clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
+               clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL",
+                          "OpenCL"))}]>,
     Option<"useOpaquePointers", "use-opaque-pointers", "bool",
-               /*default=*/"true", "Generate LLVM IR using opaque pointers "
-               "instead of typed pointers">,
+           /*default=*/"true",
+           "Generate LLVM IR using opaque pointers "
+           "instead of typed pointers">,
+    Option<"warpSize", "warp-size", "unsigned",
+           /*default=*/"32",
+           "AMD GPUs have a configurable warp size; valid choices are 32 and "
+           "64. 32 is used as the default size.">,
   ];
 }
 
diff --git a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
index 64de028c7fe4061..4d0caae203c7d31 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/LLVMIR/CMakeLists.txt
@@ -65,6 +65,10 @@ add_public_tablegen_target(MLIRNVVMConversionsIncGen)
 add_mlir_dialect(ROCDLOps rocdl)
 add_mlir_doc(ROCDLOps ROCDLDialect Dialects/ -gen-dialect-doc -dialect=rocdl)
 set(LLVM_TARGET_DEFINITIONS ROCDLOps.td)
+mlir_tablegen(ROCDLOpsEnums.h.inc -gen-enum-decls)
+mlir_tablegen(ROCDLOpsEnums.cpp.inc -gen-enum-defs)
+mlir_tablegen(ROCDLOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=rocdl)
+mlir_tablegen(ROCDLOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=rocdl)
 mlir_tablegen(ROCDLConversions.inc -gen-llvmir-conversions)
 mlir_tablegen(ROCDLOpsAttributes.h.inc -gen-attrdef-decls -attrdefs-dialect=rocdl)
 mlir_tablegen(ROCDLOpsAttributes.cpp.inc -gen-attrdef-defs -attrdefs-dialect=rocdl)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
index c2a82ffc1c43cf6..54e9980bb213f59 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLDialect.h
@@ -28,6 +28,8 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+#include "mlir/Dialect/LLVMIR/ROCDLOpsEnums.h.inc"
+
 ///// Ops /////
 #define GET_ATTRDEF_CLASSES
 #include "mlir/Dialect/LLVMIR/ROCDLOpsAttributes.h.inc"
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 6c6419bf238b457..55d5c018f7430bb 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -15,6 +15,7 @@
 
 include "mlir/Dialect/GPU/IR/CompilationAttrInterfaces.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
+include "mlir/IR/EnumAttr.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 //===----------------------------------------------------------------------===//
@@ -262,6 +263,18 @@ class ROCDL_Wmma_IntrOp<string mnemonic, list<Trait> traits = []> :
     "$args attr-dict `:` functional-type($args, $res)";
 }
 
+def ROCDLWMMAFragA : I32EnumAttrCase<"a", 0>;
+def ROCDLWMMAFragB : I32EnumAttrCase<"b", 1>;
+def ROCDLWMMAFragC : I32EnumAttrCase<"c", 2>;
+
+/// Enum attribute of the different frag types.
+def ROCDLWMMAFrag
+    : I32EnumAttr<"ROCDLWMMAFrag", "ROCDL WMMA frag type",
+                  [ROCDLWMMAFragA, ROCDLWMMAFragB, ROCDLWMMAFragC]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = "::mlir::ROCDL";
+}
+
 // Available on RDNA3
 def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">;
 def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">;
diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt
index 35790254be137be..6a7bee3a10866cd 100644
--- a/mlir/lib/Conversion/CMakeLists.txt
+++ b/mlir/lib/Conversion/CMakeLists.txt
@@ -19,6 +19,7 @@ add_subdirectory(ConvertToLLVM)
 add_subdirectory(FuncToLLVM)
 add_subdirectory(FuncToSPIRV)
 add_subdirectory(GPUCommon)
+add_subdirectory(GPUToAMDGPU)
 add_subdirectory(GPUToNVVM)
 add_subdirectory(GPUToROCDL)
 add_subdirectory(GPUToSPIRV)
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
new file mode 100644
index 000000000000000..7e201484a76cf30
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_mlir_conversion_library(MLIRGPUToAMDGPUTransforms
+  LowerGPUOpsToAMDGPUOps.cpp
+  WmmaOpsToAMDGPU.cpp
+
+  DEPENDS
+  MLIRConversionPassIncGen
+
+  LINK_LIBS PUBLIC
+  MLIRArithToLLVM
+  MLIRFuncToLLVM
+  MLIRGPUDialect
+  MLIRGPUToGPURuntimeTransforms
+  MLIRLLVMCommonConversion
+  MLIRLLVMDialect
+  MLIRMemRefToLLVM
+  MLIRROCDLDialect
+  MLIRPass
+  )
diff --git a/mlir/lib/Conversion/GPUToAMDGPU/LowerGPUOpsToAMDGPUOps.cpp b/mlir/lib/Conversion/GPUToAMDGPU/LowerGPUOpsToAMDGPUOps.cpp
new file mode 100644
index 000000000000000..c20d8eedea13361
--- /dev/null
+++ b/mlir/lib/Conversion/GPUToAMDGPU/LowerGPUOpsToAMDGPUOps.cpp
@@ -0,0 +1,101 @@
+//===- LowerGPUOpsToAMDGPUOps.cpp - MLIR GPU to AMD GPU lowering passes ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass to generate AMDGPU operations for higher-level
+// GPU operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Conversion/GPUToAMDGPU/GPUToAMDGPUPass.h"
+#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+
+namespace mlir {
+#define GEN_PASS_DEF_CONVERTGPUOPSTOAMDGPUOPS
+#include "mlir/Conversion/Passes.h.inc"
+} // namespace mlir
+
+using namespace mlir;
+
+namespace {
+struct LowerGpuOpsToAMDGPUOpsPass
+    : public impl::ConvertGpuOpsToAMDGPUOpsBase<LowerGpuOpsToAMDGPUOpsPass> {
+  LowerGpuOpsToAMDGPUOpsPass() = default;
+  LowerGpuOpsToAMDGPUOpsPass(const std::string &chipset, unsigned warpSize) {
+    if (this->chipset.getNumOccurrences() == 0)
+      this->chipset = chipset;
+    if (this->warpSize.getNumOccurrences() == 0)
+      this->warpSize = warpSize;
+  }
+
+  void runOnOperation() override {
+    gpu::GPUModuleOp m = getOperation();
+    MLIRContext *ctx = m.getContext();
+
+    // Request C wrapper emission.
+    for (auto func : m.getOps<func::FuncOp>()) {
+      func->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(),
+                    UnitAttr::get(ctx));
+    }
+
+    FailureOr<amdgpu::Chipset> maybeChipset = amdgpu::Chipset::parse(chipset);
+    if (failed(maybeChipset)) {
+      emitError(UnknownLoc::get(ctx), "Invalid chipset name: " + chipset);
+      return signalPassFailure();
+    }
+
+    TypeConverter converter;
+
+    RewritePatternSet amdgpuPatterns(ctx);
+
+    populateGpuToAMDGPUConversionPatterns(converter, amdgpuPatterns,
+                                          this->chipset, this->warpSize);
+    ConversionTarget target(*ctx);
+    // We do not mark the GPU dialect illegal, as other GPU ops and WMMA ops
+    // unsupported by the patterns defined here are still allowed.
+    target.addLegalDialect<amdgpu::AMDGPUDialect>();
+
+    if (failed(applyPartialConversion(m, target, std::move(amdgpuPatterns))))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+void mlir::populateGpuToAMDGPUConversionPatterns(TypeConverter &converter,
+                                                 RewritePatternSet &patterns,
+                                                 StringRef chipset,
+                                                 unsigned warpSize) {
+  // Lowering for MMAMatrixType.
+  converter.addConversion([&](gpu::MMAMatrixType type) -> Type {
+    return amd::convertWMMAToROCDLLLVMType(type);
+  });
+
+  // We need to add target and source materializations so that the IR still
+  // remains valid after the `gpu.mma_matrix` type conversion is done.
+  auto buildUnrealizedCast = [](OpBuilder &builder, Type type,
+                                ValueRange inputs, Location loc) {
+    auto cast = builder.create<UnrealizedConversionCastOp>(loc, type, inputs);
+    return std::optional<Value>(cast.getResult(0));
+  };
+  converter.addSourceMaterialization(buildUnrealizedCast);
+  converter.addTargetMaterialization(buildUn...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/69357


More information about the Mlir-commits mailing list