[llvm] r267729 - AMDGPU/SI: Add llvm.amdgcn.s.waitcnt intrinsic
Nicolai Haehnle via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 27 08:46:02 PDT 2016
Author: nha
Date: Wed Apr 27 10:46:01 2016
New Revision: 267729
URL: http://llvm.org/viewvc/llvm-project?rev=267729&view=rev
Log:
AMDGPU/SI: Add llvm.amdgcn.s.waitcnt intrinsic
Summary:
To guarantee some of the ordering requirements of a GLSL memoryBarrier()
executed in a shader, we need to emit an s_waitcnt; this intrinsic lets the
frontend request that wait explicitly.
(We can't use an s_barrier, because memoryBarrier() may appear anywhere in
the shader; in particular, it may appear in non-uniform control flow.)
Reviewers: arsenm, mareko, tstellarAMD
Subscribers: arsenm, llvm-commits
Differential Revision: http://reviews.llvm.org/D19203
Added:
llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=267729&r1=267728&r2=267729&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Wed Apr 27 10:46:01 2016
@@ -68,6 +68,8 @@ defm int_amdgcn_workgroup_id : AMDGPURea
def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
Intrinsic<[], [], [IntrConvergent]>;
+def int_amdgcn_s_waitcnt : Intrinsic<[], [llvm_i32_ty], []>;
+
def int_amdgcn_div_scale : Intrinsic<
// 1st parameter: Numerator
// 2nd parameter: Denominator
Modified: llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp?rev=267729&r1=267728&r2=267729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaits.cpp Wed Apr 27 10:46:01 2016
@@ -68,6 +68,10 @@ private:
/// \brief Counter values we have already waited on.
Counters WaitedOn;
+ /// \brief Counter values that we must wait on before the next counter
+ /// increase.
+ Counters DelayedWaitOn;
+
/// \brief Counter values for last instruction issued.
Counters LastIssued;
@@ -103,13 +107,17 @@ private:
/// \brief Handle instructions async components
void pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I);
+ MachineBasicBlock::iterator I,
+ const Counters& Increment);
/// \brief Insert the actual wait instruction
bool insertWait(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const Counters &Counts);
+ /// \brief Handle existing wait instructions (from intrinsics)
+ void handleExistingWait(MachineBasicBlock::iterator I);
+
/// \brief Do we need def2def checks?
bool unorderedDefines(MachineInstr &MI);
@@ -287,10 +295,10 @@ RegInterval SIInsertWaits::getRegInterva
}
void SIInsertWaits::pushInstruction(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) {
+ MachineBasicBlock::iterator I,
+ const Counters &Increment) {
// Get the hardware counter increments and sum them up
- Counters Increment = getHwCounts(*I);
Counters Limit = ZeroCounts;
unsigned Sum = 0;
@@ -430,16 +438,38 @@ static void increaseCounters(Counters &D
Dst.Array[i] = std::max(Dst.Array[i], Src.Array[i]);
}
+/// \brief check whether any of the counters is non-zero
+static bool countersNonZero(const Counters &Counter) {
+ for (unsigned i = 0; i < 3; ++i)
+ if (Counter.Array[i])
+ return true;
+ return false;
+}
+
+void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
+ assert(I->getOpcode() == AMDGPU::S_WAITCNT);
+
+ unsigned Imm = I->getOperand(0).getImm();
+ Counters Counts, WaitOn;
+
+ Counts.Named.VM = Imm & 0xF;
+ Counts.Named.EXP = (Imm >> 4) & 0x7;
+ Counts.Named.LGKM = (Imm >> 8) & 0xF;
+
+ for (unsigned i = 0; i < 3; ++i) {
+ if (Counts.Array[i] <= LastIssued.Array[i])
+ WaitOn.Array[i] = LastIssued.Array[i] - Counts.Array[i];
+ else
+ WaitOn.Array[i] = 0;
+ }
+
+ increaseCounters(DelayedWaitOn, WaitOn);
+}
+
Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
Counters Result = ZeroCounts;
- // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
- // but we also want to wait for any other outstanding transfers before
- // signalling other hardware blocks
- if (MI.getOpcode() == AMDGPU::S_SENDMSG)
- return LastIssued;
-
// For each register affected by this instruction increase the result
// sequence.
//
@@ -544,6 +574,7 @@ bool SIInsertWaits::runOnMachineFunction
MRI = &MF.getRegInfo();
WaitedOn = ZeroCounts;
+ DelayedWaitOn = ZeroCounts;
LastIssued = ZeroCounts;
LastOpcodeType = OTHER;
LastInstWritesM0 = false;
@@ -552,6 +583,8 @@ bool SIInsertWaits::runOnMachineFunction
memset(&UsedRegs, 0, sizeof(UsedRegs));
memset(&DefinedRegs, 0, sizeof(DefinedRegs));
+ SmallVector<MachineInstr *, 4> RemoveMI;
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -607,13 +640,34 @@ bool SIInsertWaits::runOnMachineFunction
I->getOpcode() == AMDGPU::V_READFIRSTLANE_B32)
TII->insertWaitStates(MBB, std::next(I), 4);
+ // Record pre-existing, explicitly requested waits
+ if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+ handleExistingWait(*I);
+ RemoveMI.push_back(I);
+ continue;
+ }
+
+ Counters Required;
+
// Wait for everything before a barrier.
- if (I->getOpcode() == AMDGPU::S_BARRIER)
- Changes |= insertWait(MBB, I, LastIssued);
+ //
+ // S_SENDMSG implicitly waits for all outstanding LGKM transfers to finish,
+ // but we also want to wait for any other outstanding transfers before
+ // signalling other hardware blocks
+ if (I->getOpcode() == AMDGPU::S_BARRIER ||
+ I->getOpcode() == AMDGPU::S_SENDMSG)
+ Required = LastIssued;
else
- Changes |= insertWait(MBB, I, handleOperands(*I));
+ Required = handleOperands(*I);
+
+ Counters Increment = getHwCounts(*I);
- pushInstruction(MBB, I);
+ if (countersNonZero(Required) || countersNonZero(Increment))
+ increaseCounters(Required, DelayedWaitOn);
+
+ Changes |= insertWait(MBB, I, Required);
+
+ pushInstruction(MBB, I, Increment);
handleSendMsg(MBB, I);
}
@@ -621,5 +675,8 @@ bool SIInsertWaits::runOnMachineFunction
Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
}
+ for (MachineInstr *I : RemoveMI)
+ I->eraseFromParent();
+
return Changes;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=267729&r1=267728&r2=267729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Wed Apr 27 10:46:01 2016
@@ -43,8 +43,9 @@ def SWaitMatchClass : AsmOperandClass {
let ParserMethod = "parseSWaitCntOps";
}
-def WAIT_FLAG : InstFlag<"printWaitFlag"> {
+def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
+ let PrintMethod = "printWaitFlag";
}
let SubtargetPredicate = isGCN in {
@@ -506,6 +507,7 @@ def S_BARRIER : SOPP <0x0000000a, (ins),
let isConvergent = 1;
}
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
@@ -2452,6 +2454,11 @@ def : Pat <
// SOPP Patterns
//===----------------------------------------------------------------------===//
+def : Pat <
+ (int_amdgcn_s_waitcnt i32:$simm16),
+ (S_WAITCNT (as_i16imm $simm16))
+>;
+
// FIXME: These should be removed eventually
def : Pat <
(int_AMDGPU_barrier_global),
Added: llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll?rev=267729&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll Wed Apr 27 10:46:01 2016
@@ -0,0 +1,38 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: image_store
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
+; CHECK-NEXT: image_store
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+ ret void
+}
+
+; Test that the intrinsic is merged with automatically generated waits and
+; emitted as late as possible.
+;
+; CHECK-LABEL: {{^}}test2:
+; CHECK: image_load
+; CHECK-NOT: s_waitcnt vmcnt(0){{$}}
+; CHECK: s_waitcnt
+; CHECK-NEXT: image_store
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, i32 %c) {
+ %t = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ call void @llvm.amdgcn.s.waitcnt(i32 3840) ; 0xf00
+ %c.1 = mul i32 %c, 2
+ call void @llvm.amdgcn.image.store.i32(<4 x float> %t, i32 %c.1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+ ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt(i32) #0
+
+declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
More information about the llvm-commits mailing list