[Mlir-commits] [mlir] d6abdf4 - [mlir][AMDGPU] Remove buffer ops that are statically out of bounds
Krzysztof Drewniak
llvmlistbot at llvm.org
Mon Nov 21 08:47:26 PST 2022
Author: Krzysztof Drewniak
Date: 2022-11-21T16:47:21Z
New Revision: d6abdf46bc4d305f6046a8134316dd19cc6b5598
URL: https://github.com/llvm/llvm-project/commit/d6abdf46bc4d305f6046a8134316dd19cc6b5598
DIFF: https://github.com/llvm/llvm-project/commit/d6abdf46bc4d305f6046a8134316dd19cc6b5598.diff
LOG: [mlir][AMDGPU] Remove buffer ops that are statically out of bounds
When the bounds check attribute is true, the raw buffer load, store,
and atomic operations have well-defined behavior when the buffer
access exceeds the bounds of the memory being accessed: loads return
0, while stores and atomics are ignored.
Because of how LLVM currently implements these buffer operations (as
opaque intrinsics), the backend cannot optimize out this known
behavior and eliminate the memory operations. Therefore, use MLIR's
canonicalization system to eliminate these operations.
Reviewed By: nirvedhmeshram
Differential Revision: https://reviews.llvm.org/D138146
Added:
mlir/test/Dialect/AMDGPU/canonicalize.mlir
Modified:
mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
index 91b62bb85c494..f08f9fb59dee5 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -23,6 +23,10 @@ def AMDGPU_Dialect : Dialect {
that will eventually be executed on AMD hardware.
}];
+
+ let dependentDialects = [
+ "arith::ArithDialect"
+ ];
let useDefaultAttributePrinterParser = 1;
}
@@ -83,6 +87,7 @@ def AMDGPU_RawBufferLoadOp :
(`sgprOffset` $sgprOffset^)? `:`
type($memref) `,` type($indices) `->` type($value)
}];
+ let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -124,6 +129,7 @@ def AMDGPU_RawBufferStoreOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
+ let hasCanonicalizer = 1;
let hasVerifier = 1;
}
@@ -162,6 +168,7 @@ def AMDGPU_RawBufferAtomicFaddOp :
(`sgprOffset` $sgprOffset^)? `:`
type($value) `->` type($memref) `,` type($indices)
}];
+ let hasCanonicalizer = 1;
let hasVerifier = 1;
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 26db766898c10..1e5ba7ade861e 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -12,14 +12,19 @@
#include "mlir/Dialect/AMDGPU/AMDGPUDialect.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/DialectImplementation.h"
+#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
+#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/TypeUtilities.h"
#include "llvm/ADT/TypeSwitch.h"
+#include <limits>
+
using namespace mlir;
using namespace mlir::amdgpu;
@@ -62,6 +67,96 @@ LogicalResult RawBufferAtomicFaddOp::verify() {
return verifyRawBufferOp(*this);
}
+static Optional<uint32_t> getConstantUint32(Value v) {
+ APInt cst;
+ if (!v.getType().isInteger(32))
+ return None;
+ if (matchPattern(v, m_ConstantInt(&cst)))
+ return cst.getZExtValue();
+ return None;
+}
+
+template <typename OpType>
+static bool staticallyOutOfBounds(OpType op) {
+ if (!op.getBoundsCheck())
+ return false;
+ MemRefType bufferType = op.getMemref().getType();
+ if (!bufferType.hasStaticShape())
+ return false;
+ int64_t offset;
+ SmallVector<int64_t> strides;
+ if (failed(getStridesAndOffset(bufferType, strides, offset)))
+ return false;
+ int64_t result = offset + op.getIndexOffset().value_or(0);
+ if (op.getSgprOffset()) {
+ Optional<uint32_t> sgprOffset = getConstantUint32(op.getSgprOffset());
+ if (!sgprOffset)
+ return false;
+ result += *sgprOffset;
+ }
+ if (strides.size() != op.getIndices().size())
+ return false;
+ int64_t indexVal = 0;
+ for (auto pair : llvm::zip(strides, op.getIndices())) {
+ int64_t stride = std::get<0>(pair);
+ Value idx = std::get<1>(pair);
+ Optional<uint32_t> idxVal = getConstantUint32(idx);
+ if (!idxVal)
+ return false;
+ indexVal += stride * idxVal.value();
+ }
+ result += indexVal;
+ if (result > std::numeric_limits<uint32_t>::max())
+ // Overflow means don't drop
+ return false;
+ return result >= bufferType.getNumElements();
+}
+
+namespace {
+struct RemoveStaticallyOobBufferLoads final
+ : public OpRewritePattern<RawBufferLoadOp> {
+ using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(RawBufferLoadOp op,
+ PatternRewriter &rw) const override {
+ if (!staticallyOutOfBounds(op))
+ return failure();
+ Type loadType = op.getResult().getType();
+ rw.replaceOpWithNewOp<arith::ConstantOp>(op, loadType,
+ rw.getZeroAttr(loadType));
+ return success();
+ }
+};
+
+template <typename OpType>
+struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
+ using OpRewritePattern<OpType>::OpRewritePattern;
+
+ LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
+ if (!staticallyOutOfBounds(op))
+ return failure();
+
+ rw.eraseOp(op);
+ return success();
+ }
+};
+} // end namespace
+
+void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferLoads>(context);
+}
+
+void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
+ MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferWrites<RawBufferStoreOp>>(context);
+}
+
+void RawBufferAtomicFaddOp::getCanonicalizationPatterns(
+ RewritePatternSet &results, MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicFaddOp>>(context);
+}
+
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
index 1b80265baa90b..5dde478898d40 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
+++ b/mlir/lib/Dialect/AMDGPU/IR/CMakeLists.txt
@@ -10,6 +10,7 @@ add_mlir_dialect_library(MLIRAMDGPUDialect
MLIRAMDGPUIncGen
LINK_LIBS PUBLIC
+ MLIRArithDialect
MLIRIR
MLIRSideEffectInterfaces
)
diff --git a/mlir/test/Dialect/AMDGPU/canonicalize.mlir b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
new file mode 100644
index 0000000000000..d984f8b06fda4
--- /dev/null
+++ b/mlir/test/Dialect/AMDGPU/canonicalize.mlir
@@ -0,0 +1,132 @@
+// RUN: mlir-opt %s -split-input-file -canonicalize | FileCheck %s
+
+// CHECK-LABEL: func @known_oob_load
+func.func @known_oob_load(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d
+func.func @known_oob_load_2d(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32, %c0_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_2d_on_last
+func.func @known_oob_load_2d_on_last(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %c16_i32 = arith.constant 16 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c16_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_index
+func.func @known_oob_load_index(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c0_i32 = arith.constant 0 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true, indexOffset = 4 : i32} %arg0[%c0_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @known_oob_load_sgproffset
+func.func @known_oob_load_sgproffset(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[zero:.*]] = arith.constant 0.000000e+00 : f32
+ // CHECK: return %[[zero]]
+ %c2_i32 = arith.constant 2 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c2_i32] sgprOffset %c2_i32 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load
+func.func @unknown_load(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%arg1] sgprOffset %c4_i32 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unknown_load_sgproffset
+func.func @unknown_load_sgproffset(%arg0: memref<4xf32>, %arg1: i32) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] sgprOffset %arg1 : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @unranked
+func.func @unranked(%arg0: memref<?xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c4_i32] : memref<?xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @no_oob_check
+func.func @no_oob_check(%arg0: memref<4xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c4_i32 = arith.constant 4 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = false} %arg0[%c4_i32] : memref<4xf32>, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @in_bounds_overall
+func.func @in_bounds_overall(%arg0: memref<4x4xf32>) -> f32 {
+ // CHECK: %[[loaded:.*]] = amdgpu.raw_buffer_load
+ // CHECK: return %[[loaded]]
+ %c0_i32 = arith.constant 0 : i32
+ %c15_i32 = arith.constant 15 : i32
+ %0 = amdgpu.raw_buffer_load {boundsCheck = true} %arg0[%c0_i32, %c15_i32] : memref<4x4xf32>, i32, i32 -> f32
+ func.return %0 : f32
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_store
+func.func @dead_store(%arg0: memref<4xf32>, %arg1: f32) {
+ // CHECK-NOT: amdgpu.raw_buffer_store
+ %c4_i32 = arith.constant 4 : i32
+ amdgpu.raw_buffer_store {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+ func.return
+}
+
+// -----
+
+// CHECK-LABEL: func @dead_atomic_add
+func.func @dead_atomic_add(%arg0: memref<4xf32>, %arg1: f32) {
+ // CHECK-NOT: amdgpu.raw_buffer_atomic_fadd
+ %c4_i32 = arith.constant 4 : i32
+ amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %arg1 -> %arg0[%c4_i32] : f32 -> memref<4xf32>, i32
+ func.return
+}
More information about the Mlir-commits
mailing list