[Mlir-commits] [mlir] [mlir][AMDGPU] Support vector<2xbf16> packed atomic fadd (PR #113929)

Mon Oct 28 15:35:09 PDT 2024

https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/113929

>From 4fd02f3cd6564db841d796884ec772e28b439531 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 11 Sep 2024 15:09:54 +0000
Subject: [PATCH] [mlir][AMDGPU] Support vector<2xbf16> packed atomic fadd

Now that we use LLVM's native bfloat types in the AMDGPU lowering,
enable vector<2xbf16> for AMDGPU.
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td         |  2 +-
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp   |  9 +--------
 .../Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir     | 11 +++++++++++
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index cba35bbca1f83b..69745addfd748e 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -254,7 +254,7 @@ def AMDGPU_RawBufferAtomicCmpswapOp :
 def AMDGPU_RawBufferAtomicFaddOp :
     AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
       AttrSizedOperandSegments]>,
-    Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16]>]>:$value,
+    Arguments<(ins AnyTypeOf<[F32, VectorOfLengthAndType<[2], [F16, BF16]>]>:$value,
                    Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
                    Variadic<I32>:$indices,
                    DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0ccd4133d3761d..5a7897f233eaa8 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -98,15 +98,8 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     // bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32
     // and the total load size is >= 32, use a vector load of N / (bitsize(T) /
     // 32) x i32 and bitcast. Also, the CAS intrinsic requires integer operands,
-    // so bitcast any floats to integers. On top of all this, cast bfloat
-    // (vectors) to i16 since the backend doesn't currently support bfloat on
-    // these operations.
+    // so bitcast any floats to integers.
     Type llvmBufferValType = llvmWantedDataType;
-    if (wantedDataType.isBF16())
-      llvmBufferValType = rewriter.getI16Type();
-    if (auto wantedVecType = dyn_cast<VectorType>(wantedDataType))
-      if (wantedVecType.getElementType().isBF16())
-        llvmBufferValType = wantedVecType.clone(rewriter.getI16Type());
     if (atomicCmpData) {
       if (auto floatType = dyn_cast<FloatType>(wantedDataType))
         llvmBufferValType = this->getTypeConverter()->convertType(
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 7fd5610a88913e..a9ea44925e9140 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -163,6 +163,17 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_v2f16(%value: vector<2xf16>, %buf: mem
   func.return
 }
 
+// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16
+func.func @gpu_gcn_raw_buffer_atomic_fadd_v2bf16(%value: vector<2xbf16>, %buf: memref<64xbf16>, %idx: i32) {
+  // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(128 : i32)
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
+  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
+  // CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : vector<2xbf16>
+  amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : vector<2xbf16> -> memref<64xbf16>, i32
+  func.return
+}
+
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
 func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
   // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)