[Mlir-commits] [mlir] [mlir][AMDGPU] Improve amdgpu.lds_barrier, add warnings (PR #77942)
Krzysztof Drewniak
llvmlistbot at llvm.org
Tue Feb 27 17:07:38 PST 2024
https://github.com/krzysz00 updated https://github.com/llvm/llvm-project/pull/77942
>From 6b1ff42abdac8f69b925722a23c216cdaaf48103 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Fri, 12 Jan 2024 09:59:12 -0600
Subject: [PATCH 1/2] [mlir][AMDGPU] Improve amdgpu.lds_barrier, add warnings
On some architectures (currently gfx90a, gfx94*, and gfx10**), we can
implement an LDS barrier using compiler intrinsics instead of inline
assembly, improving optimization possibilities and decreasing the
fragility of the underlying code.
Other AMDGPU chipsets continue to require inline assembly to implement
this barrier because, by default, the LLVM backend will insert waits
on global memory (s_waitcnt vmcnt(0)) before barriers in order to
ensure memory watchpoints set by debuggers work correctly.
Use of amdgpu.lds_barrier, on these architectures, imposes a tradeoff
between debuggability and performance. The documentation, as well as
the generated inline assembly, have been updated to explicitly call
attention to this fact.
For chipsets that did not require the inline assembly hack, we move to
the s.waitcnt and s.barrier intrinsics, which have been added to the
ROCDL dialect. The magic constants used as an argument to the waitcnt
intrinsic can be derived from
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 5 ++
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 17 +++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 63 ++++++++++++++-----
.../AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 54 ++++++++++------
mlir/test/Dialect/LLVMIR/rocdl.mlir | 13 ++++
mlir/test/Target/LLVMIR/rocdl.mlir | 17 +++++
6 files changed, 136 insertions(+), 33 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index b4bf1b5191232d..21942b179a0013 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -441,6 +441,11 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
to complete before execution continues. Therefore, it should be used when
operations on global memory can be issued far in advance of when their results
are used (for example, by writing them to LDS).
+
+ WARNING: On architectures that do not support the BackOffBarrier feature,
+ (those which will implement this barrier by emitting inline assembly),
+ use of this operation will impede the usability of memory watches (including
+ breakpoints set on variables) when debugging.
}];
let assemblyFormat = "attr-dict";
}
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 53e9f2dc6a9949..cdb50524d38b65 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -194,6 +194,23 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
//===----------------------------------------------------------------------===//
// Synchronization primitives
+// Emits the waintcnt instruction. The bitfield's semantics depend
+// on the target chipset
+def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32:$bitfield)> {
+ string llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_waitcnt,
+ {$bitfield});
+ }];
+ let assemblyFormat = "attr-dict $bitfield";
+}
+
+def ROCDL_SBarrierOp : ROCDL_Op<"s.barrier"> {
+ string llvmBuilder = [{
+ createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier);
+ }];
+ let assemblyFormat = "attr-dict";
+}
+
def ROCDL_BarrierOp : ROCDL_Op<"barrier"> {
string llvmBuilder = [{
llvm::LLVMContext &llvmContext = builder.getContext();
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 12d2462061dcf0..57f3f84973762d 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -270,21 +270,55 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
};
struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
- using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern;
+ LDSBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
LogicalResult
matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
- auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
- LLVM::AsmDialect::AD_ATT);
- const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier";
- const char *constraints = "";
- rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
- op,
- /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
- /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
- /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
- /*operand_attrs=*/ArrayAttr());
+ bool requiresInlineAsm =
+ chipset.majorVersion < 9 ||
+ (chipset.majorVersion == 9 && chipset.minorVersion < 0x0a) ||
+ (chipset.majorVersion == 11);
+
+ if (requiresInlineAsm) {
+ auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
+ LLVM::AsmDialect::AD_ATT);
+ const char *asmStr =
+ ";;;WARNING: BREAKS DEBUG WATCHES\ns_waitcnt lgkmcnt(0)\ns_barrier";
+ const char *constraints = "";
+ rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
+ op,
+ /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
+ /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true,
+ /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
+ /*operand_attrs=*/ArrayAttr());
+ return success();
+ }
+ constexpr int32_t ldsOnlyBitsGfx6789 = ~(0x1f << 8);
+ constexpr int32_t ldsOnlyBitsGfx10 = ~(0x3f << 8);
+ // Left in place in case someone disables the inline ASM path or future
+ // chipsets use the same bit pattern.
+ constexpr int32_t ldsOnlyBitsGfx11 = ~(0x3f << 4);
+
+ int32_t ldsOnlyBits;
+ if (chipset.majorVersion == 11)
+ ldsOnlyBits = ldsOnlyBitsGfx11;
+ else if (chipset.majorVersion == 10)
+ ldsOnlyBits = ldsOnlyBitsGfx10;
+ else if (chipset.majorVersion <= 9)
+ ldsOnlyBits = ldsOnlyBitsGfx6789;
+ else
+ return op.emitOpError(
+ "don't know how to lower this for chipset major version")
+ << chipset.majorVersion;
+
+ Location loc = op->getLoc();
+ Value constant = createI32Constant(rewriter, loc, ldsOnlyBits);
+ rewriter.create<ROCDL::WaitcntOp>(loc, constant);
+ rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
return success();
}
};
@@ -834,7 +868,6 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
return converter.convertType(t.clone(IntegerType::get(t.getContext(), 16)));
});
- patterns.add<LDSBarrierOpLowering>(converter);
patterns
.add<RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawPtrBufferLoadOp>,
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawPtrBufferStoreOp>,
@@ -848,9 +881,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- MFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering,
- PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering>(converter,
- chipset);
+ LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
+ ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+ PackedStochRoundFp8OpLowering>(converter, chipset);
}
std::unique_ptr<Pass> mlir::createConvertAMDGPUToROCDLPass() {
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 76e42791323494..3db551470dbd21 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -1,12 +1,13 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefix=RDNA
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefix=RDNA
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx908 | FileCheck %s --check-prefixes=CHECK,GFX9,GFX908
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx90a | FileCheck %s --check-prefixes=CHECK,GFX9,GFX90A
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10,RDNA
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11,RDNA
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_scalar_i32
func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(4 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -19,7 +20,7 @@ func.func @gpu_gcn_raw_buffer_load_scalar_i32(%buf: memref<i32>) -> i32 {
func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
// CHECK: %[[stride:.*]] = llvm.mlir.constant(0 : i16)
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
// RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %[[stride]], %[[numRecords]], %[[flags]] : !llvm.ptr to <8>
// CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
@@ -30,11 +31,11 @@ func.func @gpu_gcn_raw_buffer_load_i32(%buf: memref<64xi32>, %idx: i32) -> i32 {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_load_i32_oob_off
func.func @gpu_gcn_raw_buffer_load_i32_oob_off(%buf: memref<64xi32>, %idx: i32) -> i32 {
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
// RDNA: %[[flags:.*]] = llvm.mlir.constant(553807872 : i32)
- // RDNA: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
- // RDNA: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
- // RDNA: return %[[ret]]
+ // CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
+ // CHECK: %[[ret:.*]] = rocdl.raw.ptr.buffer.load %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
+ // CHECK: return %[[ret]]
%0 = amdgpu.raw_buffer_load {boundsCheck = false} %buf[%idx] : memref<64xi32>, i32 -> i32
func.return %0 : i32
}
@@ -103,7 +104,8 @@ func.func @gpu_gcn_raw_buffer_load_4xf8E4M3FNUZ(%buf: memref<64xf8E4M3FNUZ>, %id
// Since the lowering logic is shared with loads, only bitcasts need to be rechecked
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_scalar_i32
func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %{{.*}}, %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[] : i32 -> memref<i32>
@@ -113,7 +115,8 @@ func.func @gpu_gcn_raw_buffer_store_scalar_i32(%value: i32, %buf: memref<i32>) {
// CHECK-LABEL: func @gpu_gcn_raw_buffer_store_i32
func.func @gpu_gcn_raw_buffer_store_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.store %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_store {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -140,7 +143,8 @@ func.func @gpu_gcn_raw_buffer_store_16xi8(%value: vector<16xi8>, %buf: memref<64
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fadd_f32
func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.atomic.fadd %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -150,7 +154,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_fmax_f32
func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.atomic.fmax %{{.*}}, %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : f32
amdgpu.raw_buffer_atomic_fmax {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
@@ -160,7 +165,8 @@ func.func @gpu_gcn_raw_buffer_atomic_fmax_f32(%value: f32, %buf: memref<64xf32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_smax_i32
func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.atomic.smax %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_atomic_smax {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -170,7 +176,8 @@ func.func @gpu_gcn_raw_buffer_atomic_smax_i32(%value: i32, %buf: memref<64xi32>,
// CHECK-LABEL: func @gpu_gcn_raw_buffer_atomic_umin_i32
func.func @gpu_gcn_raw_buffer_atomic_umin_i32(%value: i32, %buf: memref<64xi32>, %idx: i32) {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: rocdl.raw.ptr.buffer.atomic.umin %{{.*}} %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
amdgpu.raw_buffer_atomic_umin {boundsCheck = true} %value -> %buf[%idx] : i32 -> memref<64xi32>, i32
@@ -183,7 +190,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
// CHECK: %[[srcCast:.*]] = llvm.bitcast %[[src]] : f32 to i32
// CHECK: %[[cmpCast:.*]] = llvm.bitcast %[[cmp]] : f32 to i32
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(256 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[srcCast]], %[[cmpCast]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i32
// CHECK: %[[dstCast:.*]] = llvm.bitcast %[[dst]] : i32 to f32
@@ -196,7 +204,8 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_f32(%src : f32, %cmp : f32, %buf : m
// CHECK-SAME: (%[[src:.*]]: i64, %[[cmp:.*]]: i64, {{.*}})
func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : memref<64xi64>, %idx: i32) -> i64 {
// CHECK: %[[numRecords:.*]] = llvm.mlir.constant(512 : i32)
- // CHECK: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // GFX9: %[[flags:.*]] = llvm.mlir.constant(159744 : i32)
+ // RDNA: %[[flags:.*]] = llvm.mlir.constant(822243328 : i32)
// CHECK: %[[resource:.*]] = rocdl.make.buffer.rsrc %{{.*}}, %{{.*}}, %[[numRecords]], %[[flags]]
// CHECK: %[[dst:.*]] = rocdl.raw.ptr.buffer.atomic.cmpswap %[[src]], %[[cmp]], %[[resource]], %{{.*}}, %{{.*}}, %{{.*}} : i64
// CHECK: return %[[dst]]
@@ -206,7 +215,16 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : m
// CHECK-LABEL: func @lds_barrier
func.func @lds_barrier() {
- // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
+ // GFX908: llvm.inline_asm has_side_effects asm_dialect = att
+ // GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
+ // GFX90A: %[[cst:.*]] = llvm.mlir.constant(-7937 : i32) : i32
+ // GFX90A: rocdl.waitcnt %[[cst]]
+ // GFX90A-NEXT: rocdl.s.barrier
+ // GFX10: %[[cst:.*]] = llvm.mlir.constant(-16129 : i32) : i32
+ // GFX10: rocdl.waitcnt %[[cst]]
+ // GFX10-NEXT: rocdl.s.barrier
+ // GFX11: llvm.inline_asm has_side_effects asm_dialect = att
+ // GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
amdgpu.lds_barrier
func.return
}
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 89e8e7836c3a0c..653edbcee02a38 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -363,6 +363,19 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}
+llvm.func @rocdl.waitcnt(%arg0 : i32) {
+ // CHECK-LABEL: rocdl.waitcnt
+ // CHECK: rocdl.waitcnt
+ rocdl.waitcnt %arg0
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier() {
+ // CHECK-LABEL: rocdl.s.barrier
+ // CHECK: rocdl.s.barrier
+ rocdl.s.barrier
+ llvm.return
+}
// -----
// expected-error at below {{attribute attached to unexpected op}}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3ea6292c679d90..7d34b876dbbd42 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -88,7 +88,24 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
llvm.return %0 : i32
}
+llvm.func @rocdl.waitcnt() {
+ // CHECK-LABEL: rocdl.waitcnt
+ // CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
+ %0 = llvm.mlir.constant(0 : i32) : i32
+ rocdl.waitcnt %0
+ llvm.return
+}
+
+llvm.func @rocdl.s.barrier() {
+ // CHECK-LABEL: rocdl.s.barrier
+ // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
+ rocdl.s.barrier
+ llvm.return
+}
+
+
llvm.func @rocdl.barrier() {
+ // CHECK-LABEL: rocdl.barrier
// CHECK: fence syncscope("workgroup") release
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
// CHECK-NEXT: fence syncscope("workgroup") acquire
>From de088604503ba7564a19ccb5ecee6c7f67f813fc Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak <Krzysztof.Drewniak at amd.com>
Date: Wed, 28 Feb 2024 01:07:05 +0000
Subject: [PATCH 2/2] Change waitcnt to take an attribute because it's an
immarg in LLVM
---
mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 4 ++--
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 3 +--
mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 6 ++----
mlir/test/Dialect/LLVMIR/rocdl.mlir | 6 +++---
mlir/test/Target/LLVMIR/rocdl.mlir | 3 +--
5 files changed, 9 insertions(+), 13 deletions(-)
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index cdb50524d38b65..32b5a1c016b6f8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -196,10 +196,10 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z",
// Emits the waintcnt instruction. The bitfield's semantics depend
// on the target chipset
-def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32:$bitfield)> {
+def ROCDL_WaitcntOp : ROCDL_Op<"waitcnt">, Arguments<(ins I32Attr:$bitfield)> {
string llvmBuilder = [{
createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_waitcnt,
- {$bitfield});
+ {builder.getInt32($bitfield)});
}];
let assemblyFormat = "attr-dict $bitfield";
}
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 57f3f84973762d..7e073bae75c0c9 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -316,8 +316,7 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
<< chipset.majorVersion;
Location loc = op->getLoc();
- Value constant = createI32Constant(rewriter, loc, ldsOnlyBits);
- rewriter.create<ROCDL::WaitcntOp>(loc, constant);
+ rewriter.create<ROCDL::WaitcntOp>(loc, ldsOnlyBits);
rewriter.replaceOpWithNewOp<ROCDL::SBarrierOp>(op);
return success();
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 3db551470dbd21..bb1cedaa276b33 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -217,11 +217,9 @@ func.func @amdgpu_raw_buffer_atomic_cmpswap_i64(%src : i64, %cmp : i64, %buf : m
func.func @lds_barrier() {
// GFX908: llvm.inline_asm has_side_effects asm_dialect = att
// GFX908-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
- // GFX90A: %[[cst:.*]] = llvm.mlir.constant(-7937 : i32) : i32
- // GFX90A: rocdl.waitcnt %[[cst]]
+ // GFX90A: rocdl.waitcnt -7937
// GFX90A-NEXT: rocdl.s.barrier
- // GFX10: %[[cst:.*]] = llvm.mlir.constant(-16129 : i32) : i32
- // GFX10: rocdl.waitcnt %[[cst]]
+ // GFX10: rocdl.waitcnt -16129
// GFX10-NEXT: rocdl.s.barrier
// GFX11: llvm.inline_asm has_side_effects asm_dialect = att
// GFX11-SAME: ";;;WARNING: BREAKS DEBUG WATCHES\0As_waitcnt lgkmcnt(0)\0As_barrier"
diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir
index 653edbcee02a38..6519186d2cfdcc 100644
--- a/mlir/test/Dialect/LLVMIR/rocdl.mlir
+++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir
@@ -363,10 +363,10 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 {
llvm.return %source5 : i32
}
-llvm.func @rocdl.waitcnt(%arg0 : i32) {
+llvm.func @rocdl.waitcnt() {
// CHECK-LABEL: rocdl.waitcnt
- // CHECK: rocdl.waitcnt
- rocdl.waitcnt %arg0
+ // CHECK: rocdl.waitcnt 0
+ rocdl.waitcnt 0
llvm.return
}
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 7d34b876dbbd42..d35acb0475e6f8 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -91,8 +91,7 @@ llvm.func @rocdl.bpermute(%src : i32) -> i32 {
llvm.func @rocdl.waitcnt() {
// CHECK-LABEL: rocdl.waitcnt
// CHECK-NEXT: call void @llvm.amdgcn.s.waitcnt(i32 0)
- %0 = llvm.mlir.constant(0 : i32) : i32
- rocdl.waitcnt %0
+ rocdl.waitcnt 0
llvm.return
}
More information about the Mlir-commits
mailing list