[Mlir-commits] [mlir] Add Lowerings for GPU WMMA F16/F32 ops to ROCDL dialect (PR #69357)

Mon Oct 23 08:03:29 PDT 2023

================
@@ -10,42 +10,91 @@
 
 #include "mlir/Conversion/GPUToROCDL/Runtimes.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include <memory>
 
+namespace llvm {
+class StringRef;
+} // namespace llvm
+
 namespace mlir {
 class LLVMTypeConverter;
 class ConversionTarget;
+class OpBuilder;
+class Location;
 class RewritePatternSet;
+class Type;
 
 template <typename OpT>
 class OperationPass;
 
 namespace gpu {
 class GPUModuleOp;
+class MMAMatrixType;
 } // namespace gpu
 
 #define GEN_PASS_DECL_CONVERTGPUOPSTOROCDLOPS
 #include "mlir/Conversion/Passes.h.inc"
 
+namespace amd {
+/// Constant representing 32 workitems in a workgroup.
+const unsigned kWaveFrontSize32 = 32;
+
+/// Constant representing 64 workitems in a workgroup.
+const unsigned kWaveFrontSize64 = 64;
+
+/// Wavefront sizes that are supported by the GPU to ROCDL lowerings.
+const unsigned kWMMASupportedWaveFrontSizes[] = {kWaveFrontSize32,
+                                                 kWaveFrontSize64};
+
+/// Generate ops to get the laneId of the current lane and return it.
+Value getLaneId(PatternRewriter &rewriter, Location loc,
+                unsigned indexBitwidth);
+
+/// Return the LLVM Type corresponding to the MMAMatrixType.
+Type convertWMMAToROCDLLLVMType(gpu::MMAMatrixType matrixType);
+} // namespace amd
+
 /// Collect a set of patterns to convert from the GPU dialect to ROCDL.
-/// If `runtime` is Unknown, gpu.printf will not be lowered
-/// The resulting pattern set should be run over a gpu.module op
-void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
-                                          RewritePatternSet &patterns,
-                                          gpu::amd::Runtime runtime);
+/// If `runtime` is Unknown, gpu.printf will not be lowered. The resulting
+/// pattern set should be run over a gpu.module op. `chipset` is the chip we are
+/// targeting. `indexBitwidth` is the bitwidth to be used while converting index
+/// types. `warpSize` is the warp size to use when generating WMMA intrinsics.
+void populateGpuToROCDLConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    gpu::amd::Runtime runtime, llvm::StringRef chipset = "gfx900",
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    unsigned warpSize = 32);
----------------
krzysz00 wrote:

Same note about deriving this from the chipset version by default

https://github.com/llvm/llvm-project/pull/69357