[clang] [llvm] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to support rules. (PR #85304)
Jeffrey Byrnes via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 14 13:07:00 PDT 2024
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/85304
I am still working with the user to define the actual rules, so it is still a WIP. However, the current version contains the main machinery of the feature.
This helps bridge the gap between sched_group_barrier and iglp_opt, giving users (with compiler support) more control over the pipelines they create.
In particular, this is aimed at helping control scheduling in blocks with loop-carried dependencies. Since this is a global scheduling problem, there is no straightforward way to tune the scheduler against these blocks.
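For illustration, here is a minimal source-level sketch of the intended usage; it is not taken from the patch. The mask and size arguments follow the existing sched_group_barrier encoding, and the fourth argument is a rule ID indexing the WIP table registered in addSchedGroupBarrierRules(), so the specific IDs and rule meanings below are assumptions and may change as the rules are finalized.

  void sketch_sched_group_barrier_rule()
  {
    // Existing 3-argument form: a group of 2 VMEM read instructions
    // (mask 0x20) in sync group 0.
    __builtin_amdgcn_sched_group_barrier(32, 2, 0);
    // New 4-argument form: a group of 8 MFMA instructions (mask 0x8) in
    // sync group 0, additionally filtered by rule 3, which in this WIP
    // corresponds to "no V_PERM_B32 true data predecessor".
    __builtin_amdgcn_sched_group_barrier(8, 8, 0, 3);
  }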
From 04dc59ff7757dea18e2202d1cbff1d675885fdae Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 12 Mar 2024 10:22:24 -0700
Subject: [PATCH] [AMDGPU] Extend __builtin_amdgcn_sched_group_barrier to
support rules.
Change-Id: Id8460dc42f41575760793c0fc70e0bc0aecc0d5e
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +-
clang/lib/CodeGen/CGBuiltin.cpp | 17 +++
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 14 +++
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 15 ++-
llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 112 ++++++++++++++++--
llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 14 +++
llvm/lib/Target/AMDGPU/SIInstructions.td | 16 +++
llvm/lib/Target/AMDGPU/SIPostRABundler.cpp | 3 +-
.../AMDGPU/llvm.amdgcn.sched.group.barrier.ll | 25 ++++
9 files changed, 202 insertions(+), 16 deletions(-)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 61ec8b79bf054d..f7b6a4610bd80a 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -63,7 +63,7 @@ BUILTIN(__builtin_amdgcn_s_sendmsghalt, "vIiUi", "n")
BUILTIN(__builtin_amdgcn_s_barrier, "v", "n")
BUILTIN(__builtin_amdgcn_wave_barrier, "v", "n")
BUILTIN(__builtin_amdgcn_sched_barrier, "vIi", "n")
-BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi", "n")
+BUILTIN(__builtin_amdgcn_sched_group_barrier, "vIiIiIi.", "n")
BUILTIN(__builtin_amdgcn_iglp_opt, "vIi", "n")
BUILTIN(__builtin_amdgcn_s_dcache_inv, "v", "n")
BUILTIN(__builtin_amdgcn_buffer_wbinvl1, "v", "n")
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 528a13fb275124..4bf71c7535db63 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18761,6 +18761,23 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_grid_size_z:
return EmitAMDGPUGridSize(*this, 2);
+ // scheduling builtins
+ case AMDGPU::BI__builtin_amdgcn_sched_group_barrier: {
+ return E->getNumArgs() == 3
+ ? Builder.CreateCall(
+ CGM.getIntrinsic(Intrinsic::amdgcn_sched_group_barrier),
+ {EmitScalarExpr(E->getArg(0)),
+ EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2))})
+ : Builder.CreateCall(
+ CGM.getIntrinsic(
+ Intrinsic::amdgcn_sched_group_barrier_rule),
+ {EmitScalarExpr(E->getArg(0)),
+ EmitScalarExpr(E->getArg(1)),
+ EmitScalarExpr(E->getArg(2)),
+ EmitScalarExpr(E->getArg(3))});
+ }
+
// r600 intrinsics
case AMDGPU::BI__builtin_r600_recipsqrt_ieee:
case AMDGPU::BI__builtin_r600_recipsqrt_ieeef:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 8a4533633706b2..e28e0a6987484b 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -436,6 +436,20 @@ void test_sched_group_barrier()
__builtin_amdgcn_sched_group_barrier(15, 10000, -1);
}
+// CHECK-LABEL: @test_sched_group_barrier_rule
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 0)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 100)
+// CHECK: call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 -100)
+void test_sched_group_barrier_rule()
+{
+ __builtin_amdgcn_sched_group_barrier(0, 1, 2, 0);
+ __builtin_amdgcn_sched_group_barrier(1, 2, 4, 0);
+ __builtin_amdgcn_sched_group_barrier(4, 8, 16, 100);
+ __builtin_amdgcn_sched_group_barrier(15, 10000, -1, -100);
+}
+
+
// CHECK-LABEL: @test_iglp_opt
// CHECK: call void @llvm.amdgcn.iglp.opt(i32 0)
// CHECK: call void @llvm.amdgcn.iglp.opt(i32 1)
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 051e603c0819d2..68fe42a8f04d21 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -297,10 +297,17 @@ def int_amdgcn_sched_barrier : ClangBuiltin<"__builtin_amdgcn_sched_barrier">,
// matching instructions that will be associated with this sched_group_barrier.
// The third parameter is an identifier which is used to describe what other
// sched_group_barriers should be synchronized with.
-def int_amdgcn_sched_group_barrier : ClangBuiltin<"__builtin_amdgcn_sched_group_barrier">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
- [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
- IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+multiclass SCHED_GROUP_BARRIER_I {
+ def NAME: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, IntrNoMem, IntrHasSideEffects,
+ IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+ def _rule: Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>, IntrNoMem, IntrHasSideEffects,
+ IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
+}
+
+
+defm int_amdgcn_sched_group_barrier : SCHED_GROUP_BARRIER_I;
// Scheduler optimization hint.
// MASK = 0: Small gemm opt
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 57769fe998d1fe..d158659141f795 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -248,6 +248,7 @@ class SchedGroup {
static void resetEdges(SUnit &SU, ScheduleDAGInstrs *DAG) {
assert(SU.getInstr()->getOpcode() == AMDGPU::SCHED_BARRIER ||
SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ SU.getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE ||
SU.getInstr()->getOpcode() == AMDGPU::IGLP_OPT);
while (!SU.Preds.empty())
@@ -399,7 +400,8 @@ void PipelineSolver::reset() {
SmallVector<SUnit *, 32> TempCollection = SG.Collection;
SG.Collection.clear();
auto SchedBarr = llvm::find_if(TempCollection, [](SUnit *SU) {
- return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER;
+ return SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE;
});
if (SchedBarr != TempCollection.end())
SG.Collection.push_back(*SchedBarr);
@@ -457,7 +459,8 @@ void PipelineSolver::makePipeline() {
<< " has: \n");
SUnit *SGBarr = nullptr;
for (auto &SU : SG.Collection) {
- if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+ if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
SGBarr = SU;
LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
}
@@ -496,7 +499,6 @@ int PipelineSolver::linkSUnit(
int PipelineSolver::addEdges(
SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
-
// For IsBottomUp, the first SchedGroup in SyncPipeline contains the
// instructions that are the ultimate successors in the resultant mutation.
// Therefore, in such a configuration, the SchedGroups occurring before the
@@ -2337,6 +2339,12 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
ScheduleDAGMI *DAG;
+ // An array of rule constructors. These are to be used by
+ // SCHED_GROUP_BARRIER_RULE with the RuleID argument being
+ // an index into this array.
+ std::vector<function_ref<std::shared_ptr<InstructionRule>(unsigned)>>
+ SchedGroupBarrierRuleCallBacks;
+
// Organize lists of SchedGroups by their SyncID. SchedGroups /
// SCHED_GROUP_BARRIERs with different SyncIDs will have no edges added
// between then.
@@ -2368,6 +2376,10 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
+ // Define the rules available to sched_group_barrier and register their
+ // constructors.
+ void addSchedGroupBarrierRules();
+
// The order in which the PipelineSolver should process the candidate
// SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
// created SchedGroup first, and will consider that as the ultimate
@@ -2379,7 +2391,9 @@ class IGroupLPDAGMutation : public ScheduleDAGMutation {
AMDGPU::SchedulingPhase Phase = AMDGPU::SchedulingPhase::Initial;
IGroupLPDAGMutation() = default;
- IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {}
+ IGroupLPDAGMutation(AMDGPU::SchedulingPhase Phase) : Phase(Phase) {
+ addSchedGroupBarrierRules();
+ }
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -2456,7 +2470,8 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
int MissedEdges = 0;
for (auto *A : Collection) {
SUnit *B = &SU;
- if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+ if (A == B || A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
continue;
if (MakePred)
std::swap(A, B);
@@ -2479,7 +2494,8 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
void SchedGroup::link(SUnit &SU, bool MakePred) {
for (auto *A : Collection) {
SUnit *B = &SU;
- if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
+ if (A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ A->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
continue;
if (MakePred)
std::swap(A, B);
@@ -2578,7 +2594,8 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
if (Opc == AMDGPU::SCHED_BARRIER) {
addSchedBarrierEdges(*R);
FoundSB = true;
- } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER) {
+ } else if (Opc == AMDGPU::SCHED_GROUP_BARRIER ||
+ Opc == AMDGPU::SCHED_GROUP_BARRIER_RULE) {
initSchedGroupBarrierPipelineStage(R);
FoundSB = true;
} else if (Opc == AMDGPU::IGLP_OPT) {
@@ -2658,21 +2675,96 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
return InvertedMask;
}
+void IGroupLPDAGMutation::addSchedGroupBarrierRules() {
+ /// Whether or not the instruction has no true data predecessors
+ /// with opcode \p Opc.
+ class NoOpcDataPred : public InstructionRule {
+ protected:
+ unsigned Opc;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ return !std::any_of(
+ SU->Preds.begin(), SU->Preds.end(), [this](const SDep &Pred) {
+ return Pred.getKind() == SDep::Data &&
+ Pred.getSUnit()->getInstr()->getOpcode() == Opc;
+ });
+ }
+
+ NoOpcDataPred(unsigned Opc, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Opc(Opc) {}
+ };
+
+ /// Whether or not the instruction has no write after read predecessors
+ /// with opcode \p Opc.
+ class NoOpcWARPred final : public InstructionRule {
+ protected:
+ unsigned Opc;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ return !std::any_of(
+ SU->Preds.begin(), SU->Preds.end(), [this](const SDep &Pred) {
+ return Pred.getKind() == SDep::Anti &&
+ Pred.getSUnit()->getInstr()->getOpcode() == Opc;
+ });
+ }
+ NoOpcWARPred(unsigned Opc, const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Opc(Opc){};
+ };
+
+ SchedGroupBarrierRuleCallBacks = {
+ [&](unsigned SGID) {
+ return std::make_shared<NoOpcWARPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
+ SGID, false);
+ },
+ [&](unsigned SGID) {
+ return std::make_shared<NoOpcWARPred>(AMDGPU::V_PERM_B32_e64, TII, SGID,
+ false);
+ },
+ [&](unsigned SGID) {
+ return std::make_shared<NoOpcDataPred>(AMDGPU::V_CNDMASK_B32_e64, TII,
+ SGID, false);
+ },
+ [&](unsigned SGID) {
+ return std::make_shared<NoOpcDataPred>(AMDGPU::V_PERM_B32_e64, TII,
+ SGID, false);
+ }};
+}
+
void IGroupLPDAGMutation::initSchedGroupBarrierPipelineStage(
std::vector<SUnit>::reverse_iterator RIter) {
// Remove all existing edges from the SCHED_GROUP_BARRIER that were added due
// to the instruction having side effects.
resetEdges(*RIter, DAG);
MachineInstr &SGB = *RIter->getInstr();
- assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER);
+ assert(SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER ||
+ SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE);
int32_t SGMask = SGB.getOperand(0).getImm();
int32_t Size = SGB.getOperand(1).getImm();
int32_t SyncID = SGB.getOperand(2).getImm();
+ std::optional<int32_t> RuleID =
+ (SGB.getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE)
+ ? SGB.getOperand(3).getImm()
+ : std::optional<int32_t>(std::nullopt);
+
+ // Sanitize the input
+ if (RuleID && (!SchedGroupBarrierRuleCallBacks.size() ||
+ *RuleID > (int)(SchedGroupBarrierRuleCallBacks.size() - 1))) {
+ RuleID = std::nullopt;
+ llvm_unreachable("Bad rule ID!");
+ }
- auto &SG = SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
+ auto SG = &SyncedSchedGroups[SyncID].emplace_back((SchedGroupMask)SGMask,
Size, SyncID, DAG, TII);
+ if (RuleID)
+ SG->addRule(SchedGroupBarrierRuleCallBacks[*RuleID](SG->getSGID()));
- SG.initSchedGroup(RIter, SyncedInstrs[SG.getSyncID()]);
+ SG->initSchedGroup(RIter, SyncedInstrs[SG->getSyncID()]);
}
bool IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index c24d39b9e5fddf..6b25e518de3e80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -247,6 +247,20 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
return;
}
+ if (MI->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER_RULE) {
+ if (isVerbose()) {
+ std::string HexString;
+ raw_string_ostream HexStream(HexString);
+ HexStream << format_hex(MI->getOperand(0).getImm(), 10, true);
+ OutStreamer->emitRawComment(
+ " sched_group_barrier mask(" + HexString + ") size(" +
+ Twine(MI->getOperand(1).getImm()) + ") SyncID(" +
+ Twine(MI->getOperand(2).getImm()) + ")" + " Rule(" +
+ Twine(MI->getOperand(3).getImm()) + ")");
+ }
+ return;
+ }
+
if (MI->getOpcode() == AMDGPU::IGLP_OPT) {
if (isVerbose()) {
std::string HexString;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 3ab788406ecb28..3602746a329115 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -402,6 +402,22 @@ def SCHED_GROUP_BARRIER : SPseudoInstSI<
let isMeta = 1;
}
+def SCHED_GROUP_BARRIER_RULE : SPseudoInstSI<
+ (outs),
+ (ins i32imm:$mask, i32imm:$size, i32imm:$syncid, i32imm:$ruleid),
+ [(int_amdgcn_sched_group_barrier_rule (i32 timm:$mask), (i32 timm:$size), (i32 timm:$syncid), (i32 timm:$ruleid))]> {
+ let SchedRW = [];
+ let hasNoSchedulingInfo = 1;
+ let hasSideEffects = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
+ let isMeta = 1;
+}
+
+
def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
[(int_amdgcn_iglp_opt (i32 timm:$mask))]> {
let SchedRW = [];
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 8464cb3d6fc43d..f967ec6202e0dc 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -133,7 +133,8 @@ bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock &MBB : MF) {
bool HasIGLPInstrs = llvm::any_of(MBB.instrs(), [](MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
- return Opc == AMDGPU::SCHED_GROUP_BARRIER || Opc == AMDGPU::IGLP_OPT;
+ return Opc == AMDGPU::SCHED_GROUP_BARRIER ||
+ Opc == AMDGPU::SCHED_GROUP_BARRIER_RULE || Opc == AMDGPU::IGLP_OPT;
});
// Don't cluster with IGLP instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
index 10f09b6390abae..e64c7e612ad99a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.ll
@@ -26,6 +26,30 @@ entry:
ret void
}
+define amdgpu_kernel void @test_sched_group_barrier_rule() #0 {
+; GCN-LABEL: test_sched_group_barrier_rule:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: ; sched_group_barrier mask(0x00000000) size(1) SyncID(2) Rule(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000001) size(2) SyncID(4) Rule(1)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000004) size(8) SyncID(16) Rule(2)
+; GCN-NEXT: ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) Rule(3)
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_rule:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000000) size(1) SyncID(2) Rule(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000001) size(2) SyncID(4) Rule(1)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000004) size(8) SyncID(16) Rule(2)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x0000000F) size(10000) SyncID(-1) Rule(3)
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+ call void @llvm.amdgcn.sched.group.barrier.rule(i32 0, i32 1, i32 2, i32 0) #1
+ call void @llvm.amdgcn.sched.group.barrier.rule(i32 1, i32 2, i32 4, i32 1) #1
+ call void @llvm.amdgcn.sched.group.barrier.rule(i32 4, i32 8, i32 16, i32 2) #1
+ call void @llvm.amdgcn.sched.group.barrier.rule(i32 15, i32 10000, i32 -1, i32 3) #1
+ ret void
+}
+
define amdgpu_kernel void @test_sched_group_barrier_pipeline_READ_VALU_WRITE(ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %out) #0 {
; GCN-LABEL: test_sched_group_barrier_pipeline_READ_VALU_WRITE:
; GCN: ; %bb.0:
@@ -1615,6 +1639,7 @@ entry:
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare void @llvm.amdgcn.sched.group.barrier(i32, i32, i32) #1
+declare void @llvm.amdgcn.sched.group.barrier.rule(i32, i32, i32, i32) #1
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32) #1
declare float @llvm.exp.f32(float) #2