[Mlir-commits] [mlir] bc61cc9 - [mlir][AMDGPU] Add lds_barrier op
Krzysztof Drewniak
llvmlistbot at llvm.org
Thu Jul 14 13:45:30 PDT 2022
Author: Krzysztof Drewniak
Date: 2022-07-14T20:45:26Z
New Revision: bc61cc9a2db56eeb5fd299132037757da339aebd
URL: https://github.com/llvm/llvm-project/commit/bc61cc9a2db56eeb5fd299132037757da339aebd
DIFF: https://github.com/llvm/llvm-project/commit/bc61cc9a2db56eeb5fd299132037757da339aebd.diff
LOG: [mlir][AMDGPU] Add lds_barrier op
The lds_barrier op allows workgroups to wait at a barrier for
operations to/from their local data store (LDS) to complete without
incurring the performance penalties of a full memory fence.
Reviewed By: nirvedhmeshram
Differential Revision: https://reviews.llvm.org/D129522
Added:
Modified:
mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
mlir/test/Dialect/AMDGPU/ops.mlir
Removed:
################################################################################
diff --git a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
index 2caee489b1244..ef54628cb81c9 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/AMDGPU.td
@@ -164,4 +164,23 @@ def AMDGPU_RawBufferAtomicFaddOp :
let hasVerifier = 1;
}
+def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
+ let summary = "Barrier that includes a wait for LDS memory operations.";
+ let description = [{
+ `amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach
+ the barrier before any of them may proceed past it) and a wait for all
+ operations that affect the Local Data Store (LDS) issued from that workgroup
+ to complete before the workgroup may continue. Since the LDS is per-workgroup
+ memory, this barrier may be used, for example, to ensure all workitems have
+ written data to LDS before any workitem attempts to read from it.
+
+ Note that `lds_barrier` does **not** force reads from or writes to global memory
+ to complete before execution continues. Therefore, it should be used when
+ operations on global memory can be issued far in advance of when their results
+ are used (for example, by writing them to LDS).
+ }];
+ let assemblyFormat = "attr-dict";
+}
+
+
#endif // AMDGPU
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index c0da60a27e696..1867df2d8e85c 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -241,6 +241,26 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
}
};
+struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> { // Conversion pattern: rewrites an LDSBarrierOp into a single LLVM inline-asm op.
+ using ConvertOpToLLVMPattern<LDSBarrierOp>::ConvertOpToLLVMPattern; // Inherit the base pattern's constructors.
+
+ LogicalResult
+ matchAndRewrite(LDSBarrierOp op, LDSBarrierOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override { // Always succeeds; `adaptor` is unused because the op is replaced without operands.
+ auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
+ LLVM::AsmDialect::AD_ATT); // The inline asm is tagged with the AT&T dialect.
+ const char *asmStr = "s_waitcnt lgkmcnt(0)\ns_barrier"; // Two instructions separated by a newline; presumably the LDS wait followed by the workgroup barrier -- confirm against the AMDGPU ISA docs.
+ const char *constraints = ""; // No constraints: the asm has no operands and no results.
+ rewriter.replaceOpWithNewOp<LLVM::InlineAsmOp>(
+ op,
+ /*resultTypes=*/TypeRange(), /*operands=*/ValueRange(),
+ /*asm_string=*/asmStr, constraints, /*has_side_effects=*/true, // NOTE(review): side-effecting presumably so the barrier is not dropped or reordered -- confirm.
+ /*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
+ /*operand_attrs=*/ArrayAttr()); // Null ArrayAttr: no per-operand attributes.
+ return success();
+ }
+};
+
struct ConvertAMDGPUToROCDLPass
: public ConvertAMDGPUToROCDLBase<ConvertAMDGPUToROCDLPass> {
ConvertAMDGPUToROCDLPass() = default;
@@ -269,6 +289,7 @@ struct ConvertAMDGPUToROCDLPass
void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
Chipset chipset) {
+ patterns.add<LDSBarrierOpLowering>(converter);
patterns.add<
RawBufferOpLowering<RawBufferLoadOp, ROCDL::RawBufferLoadOp>,
RawBufferOpLowering<RawBufferStoreOp, ROCDL::RawBufferStoreOp>,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index 129ebe668ea67..e9a999d26b147 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -101,3 +101,10 @@ func.func @gpu_gcn_raw_buffer_atomic_fadd_f32(%value: f32, %buf: memref<64xf32>,
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true} %value -> %buf[%idx] : f32 -> memref<64xf32>, i32
func.return
}
+
+// CHECK-LABEL: func @lds_barrier
+func.func @lds_barrier() {
+ // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "s_waitcnt lgkmcnt(0)\0As_barrier"
+ amdgpu.lds_barrier
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index daf6b7a0dae0b..3fff10c666ba2 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -59,3 +59,10 @@ func.func @raw_buffer_atomic_fadd_f32_to_rank_4(%value : f32, %dst : memref<128x
amdgpu.raw_buffer_atomic_fadd {boundsCheck = true, indexOffset = 1 : i32} %value -> %dst[%idx0, %idx1, %idx2, %idx3] sgprOffset %offset : f32 -> memref<128x64x32x16xf32>, i32, i32, i32, i32
func.return
}
+
+// CHECK-LABEL: func @lds_barrier
+func.func @lds_barrier() {
+ // CHECK: amdgpu.lds_barrier
+ amdgpu.lds_barrier
+ func.return
+}
More information about the Mlir-commits
mailing list