[llvm] r277500 - AMDGPU: Track physical registers in SIWholeQuadMode
Hans Wennborg via llvm-commits
llvm-commits at lists.llvm.org
Wed Aug 3 11:17:38 PDT 2016
Merged in r277619.
Thanks,
Hans
On Wed, Aug 3, 2016 at 10:49 AM, Tom Stellard <tom at stellard.net> wrote:
> On Wed, Aug 03, 2016 at 09:04:04AM -0700, Hans Wennborg wrote:
>> Sounds good to me if Tom approves.
>>
>
> This is fine with me.
>
> -Tom
>> On Wed, Aug 3, 2016 at 5:07 AM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
>> > Hi Hans,
>> >
>> > this is a bug fix that should go into the 3.9 release branch.
>> >
>> > Thanks,
>> > Nicolai
>> >
>> >
>> > On 02.08.2016 21:17, Nicolai Haehnle via llvm-commits wrote:
>> >>
>> >> Author: nha
>> >> Date: Tue Aug 2 14:17:37 2016
>> >> New Revision: 277500
>> >>
>> >> URL: http://llvm.org/viewvc/llvm-project?rev=277500&view=rev
>> >> Log:
>> >> AMDGPU: Track physical registers in SIWholeQuadMode
>> >>
>> >> Summary:
>> >> There are cases where uniform branch conditions are computed in VGPRs, and
>> >> we didn't correctly mark those as WQM.
>> >>
>> >> The stray change in basic-branch.ll is because invoking the LiveIntervals
>> >> analysis leads to the detection of a dead register that would otherwise
>> >> not
>> >> be seen at -O0.
>> >>
>> >> This is a candidate for the 3.9 branch, as it fixes a possible hang.
>> >>
>> >> Reviewers: arsenm, tstellarAMD, mareko
>> >>
>> >> Subscribers: arsenm, llvm-commits, kzhuravl
>> >>
>> >> Differential Revision: https://reviews.llvm.org/D22673
>> >>
>> >> Modified:
>> >> llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
>> >> llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
>> >> llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
>> >>
>> >> Modified: llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp
>> >> URL:
>> >> http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp?rev=277500&r1=277499&r2=277500&view=diff
>> >>
>> >> ==============================================================================
>> >> --- llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp (original)
>> >> +++ llvm/trunk/lib/Target/AMDGPU/SIWholeQuadMode.cpp Tue Aug 2 14:17:37
>> >> 2016
>> >> @@ -94,12 +94,15 @@ private:
>> >> const SIInstrInfo *TII;
>> >> const SIRegisterInfo *TRI;
>> >> MachineRegisterInfo *MRI;
>> >> + LiveIntervals *LIS;
>> >>
>> >> DenseMap<const MachineInstr *, InstrInfo> Instructions;
>> >> DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
>> >> SmallVector<const MachineInstr *, 2> ExecExports;
>> >> SmallVector<MachineInstr *, 1> LiveMaskQueries;
>> >>
>> >> + void markInstruction(MachineInstr &MI, char Flag,
>> >> + std::vector<WorkItem> &Worklist);
>> >> char scanInstructions(MachineFunction &MF, std::vector<WorkItem>
>> >> &Worklist);
>> >> void propagateInstruction(MachineInstr &MI, std::vector<WorkItem>
>> >> &Worklist);
>> >> void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem>
>> >> &Worklist);
>> >> @@ -126,6 +129,7 @@ public:
>> >> }
>> >>
>> >> void getAnalysisUsage(AnalysisUsage &AU) const override {
>> >> + AU.addRequired<LiveIntervals>();
>> >> AU.setPreservesCFG();
>> >> MachineFunctionPass::getAnalysisUsage(AU);
>> >> }
>> >> @@ -135,8 +139,11 @@ public:
>> >>
>> >> char SIWholeQuadMode::ID = 0;
>> >>
>> >> -INITIALIZE_PASS(SIWholeQuadMode, DEBUG_TYPE,
>> >> - "SI Whole Quad Mode", false, false)
>> >> +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode",
>> >> false,
>> >> + false)
>> >> +INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
>> >> +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode",
>> >> false,
>> >> + false)
>> >>
>> >> char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
>> >>
>> >> @@ -144,6 +151,23 @@ FunctionPass *llvm::createSIWholeQuadMod
>> >> return new SIWholeQuadMode;
>> >> }
>> >>
>> >> +void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
>> >> + std::vector<WorkItem> &Worklist) {
>> >> + InstrInfo &II = Instructions[&MI];
>> >> +
>> >> + assert(Flag == StateWQM || Flag == StateExact);
>> >> +
>> >> + // Ignore if the instruction is already marked. The typical case is
>> >> that we
>> >> + // mark an instruction WQM multiple times, but for atomics it can
>> >> happen that
>> >> + // Flag is StateWQM, but Needs is already set to StateExact. In this
>> >> case,
>> >> + // letting the atomic run in StateExact is correct as per the relevant
>> >> specs.
>> >> + if (II.Needs)
>> >> + return;
>> >> +
>> >> + II.Needs = Flag;
>> >> + Worklist.push_back(&MI);
>> >> +}
>> >> +
>> >> // Scan instructions to determine which ones require an Exact execmask
>> >> and
>> >> // which ones seed WQM requirements.
>> >> char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
>> >> @@ -192,8 +216,7 @@ char SIWholeQuadMode::scanInstructions(M
>> >> continue;
>> >> }
>> >>
>> >> - Instructions[&MI].Needs = Flags;
>> >> - Worklist.push_back(&MI);
>> >> + markInstruction(MI, Flags, Worklist);
>> >> GlobalFlags |= Flags;
>> >> }
>> >>
>> >> @@ -249,32 +272,35 @@ void SIWholeQuadMode::propagateInstructi
>> >> if (!Use.isReg() || !Use.isUse())
>> >> continue;
>> >>
>> >> - // At this point, physical registers appear as inputs or outputs
>> >> - // and following them makes no sense (and would in fact be incorrect
>> >> - // when the same VGPR is used as both an output and an input that
>> >> leads
>> >> - // to a NeedsWQM instruction).
>> >> - //
>> >> - // Note: VCC appears e.g. in 64-bit addition with carry -
>> >> theoretically we
>> >> - // have to trace this, in practice it happens for 64-bit computations
>> >> like
>> >> - // pointers where both dwords are followed already anyway.
>> >> - if (!TargetRegisterInfo::isVirtualRegister(Use.getReg()))
>> >> - continue;
>> >> -
>> >> - for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg())) {
>> >> - InstrInfo &DefII = Instructions[&DefMI];
>> >> + unsigned Reg = Use.getReg();
>> >>
>> >> - // Obviously skip if DefMI is already flagged as NeedWQM.
>> >> - //
>> >> - // The instruction might also be flagged as NeedExact. This happens
>> >> when
>> >> - // the result of an atomic is used in a WQM computation. In this
>> >> case,
>> >> - // the atomic must not run for helper pixels and the WQM result is
>> >> - // undefined.
>> >> - if (DefII.Needs != 0)
>> >> + // Handle physical registers that we need to track; this is mostly
>> >> relevant
>> >> + // for VCC, which can appear as the (implicit) input of a uniform
>> >> branch,
>> >> + // e.g. when a loop counter is stored in a VGPR.
>> >> + if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
>> >> + if (Reg == AMDGPU::EXEC)
>> >> continue;
>> >>
>> >> - DefII.Needs = StateWQM;
>> >> - Worklist.push_back(&DefMI);
>> >> + for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid();
>> >> ++RegUnit) {
>> >> + LiveRange &LR = LIS->getRegUnit(*RegUnit);
>> >> + const VNInfo *Value =
>> >> LR.Query(LIS->getInstructionIndex(MI)).valueIn();
>> >> + if (!Value)
>> >> + continue;
>> >> +
>> >> + // Since we're in machine SSA, we do not need to track physical
>> >> + // registers across basic blocks.
>> >> + if (Value->isPHIDef())
>> >> + continue;
>> >> +
>> >> + markInstruction(*LIS->getInstructionFromIndex(Value->def),
>> >> StateWQM,
>> >> + Worklist);
>> >> + }
>> >> +
>> >> + continue;
>> >> }
>> >> +
>> >> + for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
>> >> + markInstruction(DefMI, StateWQM, Worklist);
>> >> }
>> >> }
>> >>
>> >> @@ -471,6 +497,7 @@ bool SIWholeQuadMode::runOnMachineFuncti
>> >> TII = ST.getInstrInfo();
>> >> TRI = &TII->getRegisterInfo();
>> >> MRI = &MF.getRegInfo();
>> >> + LIS = &getAnalysis<LiveIntervals>();
>> >>
>> >> char GlobalFlags = analyzeFunction(MF);
>> >> if (!(GlobalFlags & StateWQM)) {
>> >>
>> >> Modified: llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll
>> >> URL:
>> >> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll?rev=277500&r1=277499&r2=277500&view=diff
>> >>
>> >> ==============================================================================
>> >> --- llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll (original)
>> >> +++ llvm/trunk/test/CodeGen/AMDGPU/basic-branch.ll Tue Aug 2 14:17:37
>> >> 2016
>> >> @@ -6,7 +6,6 @@
>> >> ; GCN-LABEL: {{^}}test_branch:
>> >> ; GCNNOOPT: v_writelane_b32
>> >> ; GCNNOOPT: v_writelane_b32
>> >> -; GCNNOOPT: v_writelane_b32
>> >> ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
>> >>
>> >> ; GCN: ; BB#1
>> >>
>> >> Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
>> >> URL:
>> >> http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=277500&r1=277499&r2=277500&view=diff
>> >>
>> >> ==============================================================================
>> >> --- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
>> >> +++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Tue Aug 2 14:17:37 2016
>> >> @@ -350,6 +350,44 @@ main_body:
>> >> ret float %s
>> >> }
>> >>
>> >> +; CHECK-LABEL: {{^}}test_loop_vcc:
>> >> +; CHECK-NEXT: ; %entry
>> >> +; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
>> >> +; CHECK: s_wqm_b64 exec, exec
>> >> +; CHECK: s_and_b64 exec, exec, [[LIVE]]
>> >> +; CHECK: image_store
>> >> +; CHECK: s_wqm_b64 exec, exec
>> >> +; CHECK: v_mov_b32_e32 [[CTR:v[0-9]+]], -2
>> >> +; CHECK: s_branch [[LOOPHDR:BB[0-9]+_[0-9]+]]
>> >> +
>> >> +; CHECK: [[LOOPHDR]]: ; %loop
>> >> +; CHECK: v_add_i32_e32 [[CTR]], vcc, 2, [[CTR]]
>> >> +; CHECK: v_cmp_lt_i32_e32 vcc, 7, [[CTR]]
>> >> +; CHECK: s_cbranch_vccz
>> >> +; CHECK: ; %break
>> >> +
>> >> +; CHECK: ; return
>> >> +define amdgpu_ps <4 x float> @test_loop_vcc(<4 x float> %in) nounwind {
>> >> +entry:
>> >> + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %in, <4 x i32>
>> >> undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
>> >> + br label %loop
>> >> +
>> >> +loop:
>> >> + %ctr.iv = phi i32 [ 0, %entry ], [ %ctr.next, %body ]
>> >> + %c.iv = phi <4 x float> [ %in, %entry ], [ %c.next, %body ]
>> >> + %cc = icmp sgt i32 %ctr.iv, 7
>> >> + br i1 %cc, label %break, label %body
>> >> +
>> >> +body:
>> >> + %c.i = bitcast <4 x float> %c.iv to <4 x i32>
>> >> + %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i,
>> >> <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0,
>> >> i32 0, i32 0)
>> >> + %ctr.next = add i32 %ctr.iv, 2
>> >> + br label %loop
>> >> +
>> >> +break:
>> >> + ret <4 x float> %c.iv
>> >> +}
>> >> +
>> >> declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x
>> >> i32>, i32, i1, i1, i1, i1) #1
>> >>
>> >> declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>,
>> >> i32, i1, i1, i1, i1) #2
>> >>
>> >>
>> >> _______________________________________________
>> >> llvm-commits mailing list
>> >> llvm-commits at lists.llvm.org
>> >> http://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-commits
>> >>
>> >
More information about the llvm-commits
mailing list