[Mlir-commits] [mlir] [mlir] AMDGPUToROCDL: RawBufferOpLowering fixes (PR #120642)

Thu Dec 19 13:50:16 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Ivan Butygin (Hardcode84)

<details>
<summary>Changes</summary>

1. We can use `getNumElements()` only for memrefs with trivial layout.
2. Buffer ops expecting sizes in i32 but descriptor values can be either i32 or i64, add appropriate casts. This implementation is not ideal as it can overflow, but it's still better than generating broken IR.

---
Full diff: https://github.com/llvm/llvm-project/pull/120642.diff


2 Files Affected:

- (modified) mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp (+14-6) 
- (modified) mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir (+19) 


``````````diff

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 5a7897f233eaa8..4100b086fad8ba 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -91,6 +91,13 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     Type llvmI32 = this->typeConverter->convertType(i32);
     Type llvmI16 = this->typeConverter->convertType(rewriter.getI16Type());
 
+    auto toI32 = [&](Value val) -> Value {
+      if (val.getType() == llvmI32)
+        return val;
+
+      return rewriter.create<LLVM::TruncOp>(loc, llvmI32, val);
+    };
+
     int64_t elementByteWidth = memrefType.getElementTypeBitWidth() / 8;
     Value byteWidthConst = createI32Constant(rewriter, loc, elementByteWidth);
 
@@ -166,22 +173,22 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
     Value stride = rewriter.create<LLVM::ConstantOp>(
         loc, llvmI16, rewriter.getI16IntegerAttr(0));
     Value numRecords;
-    if (memrefType.hasStaticShape()) {
+    if (memrefType.hasStaticShape() && memrefType.getLayout().isIdentity()) {
       numRecords = createI32Constant(
           rewriter, loc,
           static_cast<int32_t>(memrefType.getNumElements() * elementByteWidth));
     } else {
       Value maxIndex;
       for (uint32_t i = 0, e = memrefType.getRank(); i < e; ++i) {
-        Value size = memrefDescriptor.size(rewriter, loc, i);
-        Value stride = memrefDescriptor.stride(rewriter, loc, i);
+        Value size = toI32(memrefDescriptor.size(rewriter, loc, i));
+        Value stride = toI32(memrefDescriptor.stride(rewriter, loc, i));
         stride = rewriter.create<LLVM::MulOp>(loc, stride, byteWidthConst);
         Value maxThisDim = rewriter.create<LLVM::MulOp>(loc, size, stride);
         maxIndex = maxIndex ? rewriter.create<LLVM::MaximumOp>(loc, maxIndex,
                                                                maxThisDim)
                             : maxThisDim;
       }
-      numRecords = rewriter.create<LLVM::TruncOp>(loc, llvmI32, maxIndex);
+      numRecords = maxIndex;
     }
 
     // Flag word:
@@ -218,7 +225,8 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
       Value strideOp;
       if (ShapedType::isDynamic(strides[i])) {
         strideOp = rewriter.create<LLVM::MulOp>(
-            loc, memrefDescriptor.stride(rewriter, loc, i), byteWidthConst);
+            loc, toI32(memrefDescriptor.stride(rewriter, loc, i)),
+            byteWidthConst);
       } else {
         strideOp =
             createI32Constant(rewriter, loc, strides[i] * elementByteWidth);
@@ -240,7 +248,7 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
       sgprOffset = createI32Constant(rewriter, loc, 0);
     if (ShapedType::isDynamic(offset))
       sgprOffset = rewriter.create<LLVM::AddOp>(
-          loc, memrefDescriptor.offset(rewriter, loc), sgprOffset);
+          loc, toI32(memrefDescriptor.offset(rewriter, loc)), sgprOffset);
     else if (offset > 0)
       sgprOffset = rewriter.create<LLVM::AddOp>(
           loc, sgprOffset, createI32Constant(rewriter, loc, offset));
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index a9ea44925e9140..4c7515dc810516 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -30,6 +30,25 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
   func.return %0 : i32
 }
 
+// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_strided
+func.func @gpu_gcn_raw_buffer_load_i32_strided(%buf: memref<64xi32, strided<[?], offset: ?>>, %idx: i32) -> i32 {
+  // CHECK-DAG: %[[rstride:.*]] = llvm.mlir.constant(0 : i16)
+  // CHECK-DAG: %[[elem_size:.*]] = llvm.mlir.constant(4 : i32)
+  // CHECK: %[[size:.*]] = llvm.extractvalue %{{.*}}[3, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[size32:.*]] = llvm.trunc %[[size]] : i64 to i32
+  // CHECK: %[[stride:.*]] = llvm.extractvalue %{{.*}}[4, 0] : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)>
+  // CHECK: %[[stride32:.*]] = llvm.trunc %[[stride]] : i64 to i32
+  // CHECK: %[[tmp:.*]] = llvm.mul %[[stride32]], %[[elem_size]] : i32
+  // CHECK: %[[numRecords:.*]] = llvm.mul %[[size32]], %[[tmp]] : i32
+  // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+  // RDNA:  %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
+  // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[rstride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
+  // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+  // CHECK: return %[[ret]]
+  %0 = amdgpu.raw_buffer_load {boundsCheck = true} %buf[%idx] : memref<64xi32, strided<[?], offset: ?>>, i32 -> i32
+  func.return %0 : i32
+}
+
 // CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
 func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
   // GFX9:  %[[flags:.*]] = llvm.mlir.constant(159744 : i32)

``````````

</details>


https://github.com/llvm/llvm-project/pull/120642