[Mlir-commits] [mlir] [MLIR][AMDGPU] Add amdgpu.sched_barrier (PR #98911)
Manupa Karunaratne
llvmlistbot at llvm.org
Mon Jul 15 08:13:20 PDT 2024
https://github.com/manupak updated https://github.com/llvm/llvm-project/pull/98911
>From 20af1ecbfc29f5db66c1067eb25eb29385d28202 Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne <manupa.karunaratne at amd.com>
Date: Mon, 15 Jul 2024 08:30:42 -0500
Subject: [PATCH] [MLIR][AMDGPU] Add amdgpu.sched_barrier
This commit adds a sched_barrier operation
to the AMDGPU dialect that lowers to rocdl.sched.barrier.
---
mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 40 +++++++++++++++++++
.../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 20 +++++++++-
.../AMDGPUToROCDL/amdgpu-to-rocdl.mlir | 31 ++++++++++++++
mlir/test/Dialect/AMDGPU/ops.mlir | 9 +++++
4 files changed, 98 insertions(+), 2 deletions(-)
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 3f27e1541cf38..60fe20b002999 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -433,6 +433,46 @@ def AMDGPU_LDSBarrierOp : AMDGPU_Op<"lds_barrier"> {
let assemblyFormat = "attr-dict";
}
+def AMDGPU_SchedBarrierOpOpt : I32BitEnumAttr<"sched_barrier_opt_enum",
+ "The possible options for scheduling barriers",
+ [
+ I32BitEnumAttrCaseBit<"allow_none", 0>,
+ I32BitEnumAttrCaseBit<"allow_non_mem_non_sideffect", 1>,
+ I32BitEnumAttrCaseBit<"allow_valu", 2>,
+ I32BitEnumAttrCaseBit<"allow_salu", 3>,
+ I32BitEnumAttrCaseBit<"allow_mfma_wmma", 4>,
+ I32BitEnumAttrCaseBit<"allow_all_vmem", 5>,
+ I32BitEnumAttrCaseBit<"allow_vmem_read", 6>,
+ I32BitEnumAttrCaseBit<"allow_vmem_write", 7>,
+ I32BitEnumAttrCaseBit<"allow_all_ds", 8>,
+ I32BitEnumAttrCaseBit<"allow_ds_read", 9>,
+ I32BitEnumAttrCaseBit<"allow_ds_write", 10>,
+ I32BitEnumAttrCaseBit<"allow_transcendental", 11>
+ ]> {
+ let genSpecializedAttr = 0;
+ let cppNamespace = "::mlir::amdgpu";
+}
+
+def AMDGPU_SchedBarrierOpOptAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_SchedBarrierOpOpt,
+ "sched_barrier_opt">{
+ let assemblyFormat = "`<` $value `>`";
+}
+
+def AMDGPU_SchedBarrierOp :
+ AMDGPU_Op<"sched_barrier">,
+ Arguments<(ins AMDGPU_SchedBarrierOpOptAttr:$opts)>
+ {
+ let summary = "Barrier that limits the backend scheduler of instruction movement";
+ let description = [{
+ `amdgpu.sched_barrier` serves as a barrier that could be
+ configured to restrict movements of instructions through it as
+ defined by sched_barrier_opts.
+ }];
+ let assemblyFormat = [{
+ $opts attr-dict
+ }];
+}
+
def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
"The possible permutations of the lanes storing B available in an MFMA",
[
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 033e66c6118f3..b808738804030 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -321,6 +321,22 @@ struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
return success();
}
};
+
+struct SchedBarrierOpLowering : public ConvertOpToLLVMPattern<SchedBarrierOp> {
+ SchedBarrierOpLowering(LLVMTypeConverter &converter, Chipset chipset)
+ : ConvertOpToLLVMPattern<SchedBarrierOp>(converter), chipset(chipset) {}
+
+ Chipset chipset;
+
+ LogicalResult
+ matchAndRewrite(SchedBarrierOp op, SchedBarrierOp::Adaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ rewriter.replaceOpWithNewOp<ROCDL::SchedBarrier>(op,
+ (uint32_t)op.getOpts());
+ return success();
+ }
+};
+
} // namespace
/// If `input` is a vector of bytes, concatentate those bytes in little-endian
@@ -879,8 +895,8 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
ROCDL::RawPtrBufferAtomicUminOp>,
RawBufferOpLowering<RawBufferAtomicCmpswapOp,
ROCDL::RawPtrBufferAtomicCmpSwap>,
- LDSBarrierOpLowering, MFMAOpLowering, WMMAOpLowering,
- ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
+ LDSBarrierOpLowering, SchedBarrierOpLowering, MFMAOpLowering,
+ WMMAOpLowering, ExtPackedFp8OpLowering, PackedTrunc2xFp8OpLowering,
PackedStochRoundFp8OpLowering>(converter, chipset);
}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
index bb1cedaa276b3..b034722ae3cca 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/amdgpu-to-rocdl.mlir
@@ -226,3 +226,34 @@ func.func @lds_barrier() {
amdgpu.lds_barrier
func.return
}
+
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+ // rocdl.sched.barrier 0
+ amdgpu.sched_barrier <allow_none>
+ // rocdl.sched.barrier 1
+ amdgpu.sched_barrier <allow_non_mem_non_sideffect>
+ // rocdl.sched.barrier 2
+ amdgpu.sched_barrier <allow_valu>
+ // rocdl.sched.barrier 4
+ amdgpu.sched_barrier <allow_salu>
+ // rocdl.sched.barrier 8
+ amdgpu.sched_barrier <allow_mfma_wmma>
+ // rocdl.sched.barrier 16
+ amdgpu.sched_barrier <allow_all_vmem>
+ // rocdl.sched.barrier 32
+ amdgpu.sched_barrier <allow_vmem_read>
+ // rocdl.sched.barrier 64
+ amdgpu.sched_barrier <allow_vmem_write>
+ // rocdl.sched.barrier 128
+ amdgpu.sched_barrier <allow_all_ds>
+ // rocdl.sched.barrier 256
+ amdgpu.sched_barrier <allow_ds_read>
+ // rocdl.sched.barrier 512
+ amdgpu.sched_barrier <allow_ds_write>
+ // rocdl.sched.barrier 1024
+ amdgpu.sched_barrier <allow_transcendental>
+ // rocdl.sched.barrier 18
+ amdgpu.sched_barrier <allow_valu|allow_all_vmem>
+ func.return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 744a096d757e0..33a9f6b94aa13 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -109,6 +109,15 @@ func.func @lds_barrier() {
func.return
}
+// CHECK-LABEL: func @sched_barrier
+func.func @sched_barrier() {
+ // CHECK: amdgpu.sched_barrier <allow_none>
+ amdgpu.sched_barrier <allow_none>
+ // CHECK: amdgpu.sched_barrier <allow_valu|allow_all_vmem>
+ amdgpu.sched_barrier <allow_valu|allow_all_vmem>
+ func.return
+}
+
// CHECK-LABEL: func @mfma
func.func @mfma(%arg0 : f32, %arg1 : vector<32xf32>) -> vector<32xf32> {
// CHECK: amdgpu.mfma
More information about the Mlir-commits
mailing list