[Mlir-commits] [mlir] 98c1104 - [mlir][AMDGPU] Define atomic compare-and-swap for raw buffers
Krzysztof Drewniak
llvmlistbot at llvm.org
Wed May 3 14:11:25 PDT 2023
Author: Krzysztof Drewniak
Date: 2023-05-03T21:11:20Z
New Revision: 98c1104d41efae7cee6b0f2c833367c1c7231c4c
URL: https://github.com/llvm/llvm-project/commit/98c1104d41efae7cee6b0f2c833367c1c7231c4c
DIFF: https://github.com/llvm/llvm-project/commit/98c1104d41efae7cee6b0f2c833367c1c7231c4c.diff
LOG: [mlir][AMDGPU] Define atomic compare-and-swap for raw buffers
This commit adds the buffer cmpswap intrinsic to the ROCDL dialect and
its corresponding AMDGPU dialect wrappers.
Reviewed By: nirvedhmeshram
Differential Revision: https://reviews.llvm.org/D148722
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
mlir/test/Dialect/AMDGPU/ops.mlir
mlir/test/Dialect/LLVMIR/rocdl.mlir
mlir/test/Target/LLVMIR/rocdl.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
index 3589fa607b72d..63c15ac537d15 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -136,6 +136,48 @@ def AMDGPU_RawBufferStoreOp :
let hasVerifier = 1;
}
+// Raw buffer atomic compare-and-swap
+def AMDGPU_RawBufferAtomicCmpswapOp :
+ AMDGPU_Op<"raw_buffer_atomic_cmpswap", [
+ AttrSizedOperandSegments,
+ AllTypesMatch<["src", "cmp", "value"]>,
+ AllElementTypesMatch<["value", "memref"]>]>,
+ Arguments<(ins AnyTypeOf<[I32, I64, F32, F64]>:$src,
+ AnyType:$cmp,
+ Arg<AnyMemRef, "buffer to operate on", [MemRead, MemWrite]>:$memref,
+ Variadic<I32>:$indices,
+ DefaultValuedAttr<BoolAttr, "true">:$boundsCheck,
+ OptionalAttr<I32Attr>:$indexOffset,
+ Optional<I32>:$sgprOffset)>,
+ Results<(outs AnyType:$value)> {
+
+ let summary = "Raw Buffer Atomic compare-and-swap";
+ let description = [{
+ The `amdgpu.raw_buffer_atomic_cmpswap` op is a wrapper around the
+ buffer-based atomic compare-and-swap available on AMD GPUs.
+
+ The index into the buffer is computed as for `memref.store` with the addition
+ of `indexOffset` (which is used to aid in emitting vectorized code) and,
+ if present, `sgprOffset` (which is added after bounds checks and includes
+ any non-zero offset on the memref type).
+
+ All indexing components are given in terms of the memref's element size, not
+ the byte lengths required by the intrinsic.
+
+ Out of bounds atomic operations are ignored in hardware.
+
+ See `amdgpu.raw_buffer_load` for a description of how the underlying
+ instruction is constructed.
+ }];
+ let assemblyFormat = [{
+ attr-dict $src `,` $cmp `->` $memref `[` $indices `]`
+ (`sgprOffset` $sgprOffset^)? `:`
+ type($value) `->` type($memref) `,` type($indices)
+ }];
+ let hasCanonicalizer = 1;
+ let hasVerifier = 1;
+}
+
// Raw buffer atomic floating point add
def AMDGPU_RawBufferAtomicFaddOp :
AMDGPU_Op<"raw_buffer_atomic_fadd", [AllElementTypesMatch<["value", "memref"]>,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 470fdfef4f1ee..3187ce1615ff9 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -252,6 +252,25 @@ def ROCDL_RawBufferStoreOp :
let hasCustomAssemblyFormat = 1;
}
+def ROCDL_RawBufferAtomicCmpSwap :
+ ROCDL_Op<"raw.buffer.atomic.cmpswap", [AllTypesMatch<["res", "src", "cmp"]>]>,
+ Results<(outs LLVM_Type:$res)>,
+ Arguments<(ins LLVM_Type:$src,
+ LLVM_Type:$cmp,
+ LLVM_Type:$rsrc,
+ I32:$offset,
+ I32:$soffset,
+ I32:$aux)>{
+ string llvmBuilder = [{
+ $res = createIntrinsicCall(builder,
+ llvm::Intrinsic::amdgcn_raw_buffer_atomic_cmpswap, {$src, $cmp, $rsrc,
+ $offset, $soffset, $aux}, {$_resultType});
+ }];
+ let assemblyFormat = [{
+ attr-dict `(` operands `)` `:` type($res) `,` type($rsrc)
+ }];
+}
+
//===---------------------------------------------------------------------===//
// MI-100 and MI-200 buffer atomic floating point add intrinsic
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index d4ee6e7e5d347..015b7164421a1 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -62,6 +62,14 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
else
wantedDataType = gpuOp.getODSResults(0)[0].getType();
+ Value atomicCmpData = Value();
+ // Operand index 1 of a load is the indices; trying to read them can crash.
+ if (storeData) {
+ Value maybeCmpData = adaptor.getODSOperands(1)[0];
+ if (maybeCmpData != memref)
+ atomicCmpData = maybeCmpData;
+ }
+
Type llvmWantedDataType = this->typeConverter->convertType(wantedDataType);
Type i32 = rewriter.getI32Type();
@@ -73,8 +81,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
// If we want to load a vector<NxT> with total size <= 32
// bits, use a scalar load and bitcast it. Similarly, if bitsize(T) < 32
// and the total load size is >= 32, use a vector load of N / (bitsize(T) /
- // 32) x i32 and bitcast.
+ // 32) x i32 and bitcast. Also, the CAS intrinsic requires integer operands,
+ // so bitcast any floats to integers.
Type llvmBufferValType = llvmWantedDataType;
+ if (atomicCmpData) {
+ if (wantedDataType.isa<VectorType>())
+ return gpuOp.emitOpError("vector compare-and-swap does not exist");
+ if (auto floatType = wantedDataType.dyn_cast<FloatType>())
+ llvmBufferValType = this->getTypeConverter()->convertType(
+ rewriter.getIntegerType(floatType.getWidth()));
+ }
if (auto dataVector = wantedDataType.dyn_cast<VectorType>()) {
uint32_t elemBits = dataVector.getElementTypeBitWidth();
uint32_t totalBits = elemBits * dataVector.getNumElements();
@@ -109,6 +125,16 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
}
+ if (atomicCmpData) {
+ if (llvmBufferValType != llvmWantedDataType) {
+ Value castForCmp = rewriter.create<LLVM::BitcastOp>(
+ loc, llvmBufferValType, atomicCmpData);
+ args.push_back(castForCmp);
+ } else {
+ args.push_back(atomicCmpData);
+ }
+ }
+
// Construct buffer descriptor from memref, attributes
int64_t offset = 0;
SmallVector<int64_t, 5> strides;
@@ -529,6 +555,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RawBufferOpLowering<RawBufferAtomicFmaxOp, ROCDL::RawBufferAtomicFMaxOp>,
RawBufferOpLowering<RawBufferAtomicSmaxOp, ROCDL::RawBufferAtomicSMaxOp>,
RawBufferOpLowering<RawBufferAtomicUminOp, ROCDL::RawBufferAtomicUMinOp>,
+ RawBufferOpLowering<RawBufferAtomicCmpswapOp,
+ ROCDL::RawBufferAtomicCmpSwap>,
MFMAOpLowering>(converter, chipset);
}
diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
index 9150e7368849d..b66a4b29fcc2a 100644
--- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
+++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp
@@ -90,6 +90,10 @@ LogicalResult RawBufferAtomicUminOp::verify() {
return verifyRawBufferOp(*this);
}
+LogicalResult RawBufferAtomicCmpswapOp::verify() {
+ return verifyRawBufferOp(*this);
+}
+
static std::optional<uint32_t> getConstantUint32(Value v) {
APInt cst;
if (!v.getType().isInteger(32))
@@ -136,12 +140,11 @@ static bool staticallyOutOfBounds(OpType op) {
}
namespace {
-struct RemoveStaticallyOobBufferLoads final
- : public OpRewritePattern<RawBufferLoadOp> {
- using OpRewritePattern<RawBufferLoadOp>::OpRewritePattern;
+template <typename OpType>
+struct RemoveStaticallyOobBufferLoads final : public OpRewritePattern<OpType> {
+ using OpRewritePattern<OpType>::OpRewritePattern;
- LogicalResult matchAndRewrite(RawBufferLoadOp op,
- PatternRewriter &rw) const override {
+ LogicalResult matchAndRewrite(OpType op, PatternRewriter &rw) const override {
if (!staticallyOutOfBounds(op))
return failure();
Type loadType = op.getResult().getType();
@@ -167,7 +170,7 @@ struct RemoveStaticallyOobBufferWrites final : public OpRewritePattern<OpType> {
void RawBufferLoadOp::getCanonicalizationPatterns(RewritePatternSet &results,
MLIRContext *context) {
- results.add<RemoveStaticallyOobBufferLoads>(context);
+ results.add<RemoveStaticallyOobBufferLoads<RawBufferLoadOp>>(context);
}
void RawBufferStoreOp::getCanonicalizationPatterns(RewritePatternSet &results,
@@ -195,6 +198,12 @@ void RawBufferAtomicUminOp::getCanonicalizationPatterns(
results.add<RemoveStaticallyOobBufferWrites<RawBufferAtomicUminOp>>(context);
}
+void RawBufferAtomicCmpswapOp::getCanonicalizationPatterns(
+ RewritePatternSet &results, MLIRContext *context) {
+ results.add<RemoveStaticallyOobBufferLoads<RawBufferAtomicCmpswapOp>>(
+ context);
+}
+
//===----------------------------------------------------------------------===//
// MFMAOp
//===----------------------------------------------------------------------===//
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index a26add21cefb6..1dbb70b015337 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -197,6 +197,35 @@ func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>,
func.return
}
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_f32
+// CHECK-SAME: (%[[src:.*]]: f32, %[[cmp:.*]]: f32, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : memref<64xf32>, %idx: i32) -> f32 {
+ // CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
+ // CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
+ // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
+ // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+ // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+ // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
+ // CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
+ // CHECK: return %[[dstCast]]
+ %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : f32 -> memref<64xf32>, i32
+ func.return %dst : f32
+}
+
+// CHECK-LABEL: func @amdgpu_raw_buffer_atomic_cmpswap_i64
+// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
+func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
+ // CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
+ // CHECK: llvm.insertelement{{.*}}%[[numRecords]]
+ // CHECK: %[[word3:.*]] = llvm.mlir.constant(159744 : i32)
+ // CHECK: %[[resource:.*]] = llvm.insertelement{{.*}}%[[word3]]
+ // CHECK: %[[dst:.*]] = rocdl.raw.buffer.atomic.cmpswap(%[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}}) : i64, vector<4xi32>
+ // CHECK: return %[[dst]]
+ %dst = amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true} %src, %cmp -> %buf[%idx] : i64 -> memref<64xi64>, i32
+ func.return %dst : i64
+}
+
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 80bd7d4655e8b..a612fba56a763 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -74,6 +74,13 @@ func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x
func.return
}
+// CHECK-LABEL: func @raw_buffer_atomic_cmpswap_f32
+func.func @raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %dst : memref<128x64x32x16xf32>, %offset : i32, %idx0 : i32, %idx1 : i32, %idx2 : i32, %idx3 : i32) {
+ // CHECK: amdgpu.raw_buffer_atomic_cmpswap {indexOffset = 1 : i32} %{{.*}}, %{{.*}} -> %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}] sgprOffset %{{.*}} : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+ amdgpu.raw_buffer_atomic_cmpswap {boundsCheck = true, indexOffset = 1 : i32} %src, %cmp -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
+ func.return
+}
+
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
// CHECK: amdgpu.lds_barrier
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 200d3e9f7d757..11605afc7d16a 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -262,9 +262,11 @@ llvm.func @rocdl.raw.buffer.i32(%rsrc : vector<4xi32>,
// CHECK-LABEL: rocdl.raw.buffer.i32
// CHECK: rocdl.raw.buffer.atomic.smax %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
// CHECK: rocdl.raw.buffer.atomic.umin %{{.*}} %{{.*}} %{{.*}} %{{.*}} %{{.*}} : i32
+ // CHECK: %{{.*}} = rocdl.raw.buffer.atomic.cmpswap(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) : i32, vector<4xi32>
rocdl.raw.buffer.atomic.smax %vdata1, %rsrc, %offset, %soffset, %aux : i32
rocdl.raw.buffer.atomic.umin %vdata1, %rsrc, %offset, %soffset, %aux : i32
+ %val = rocdl.raw.buffer.atomic.cmpswap(%vdata1, %vdata1, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 1a65967974e87..e3f942e8bd785 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -294,6 +294,18 @@ llvm.func @rocdl.raw.buffer.atomic.i32(%rsrc : vector<4xi32>,
llvm.return
}
+llvm.func @rocdl.raw.buffer.atomic.cmpswap(%rsrc : vector<4xi32>,
+ %offset : i32, %soffset : i32,
+ %src : i32, %cmp : i32) -> i32 {
+ %aux = llvm.mlir.constant(0 : i32) : i32
+ // CHECK-LABEL: rocdl.raw.buffer.atomic.cmpswap
+ // CHECK: [[val:%.+]] = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 %{{.*}}, i32 %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}
+ // CHECK: ret i32 [[val]]
+
+ %val = rocdl.raw.buffer.atomic.cmpswap(%src, %cmp, %rsrc, %offset, %soffset, %aux) : i32, vector<4xi32>
+ llvm.return %val : i32
+}
+
// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
More information about the Mlir-commits
mailing list