[llvm] r279464 - AMDGPU: Split SILowerControlFlow into two pieces
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 22 12:33:16 PDT 2016
Author: arsenm
Date: Mon Aug 22 14:33:16 2016
New Revision: 279464
URL: http://llvm.org/viewvc/llvm-project?rev=279464&view=rev
Log:
AMDGPU: Split SILowerControlFlow into two pieces
Do most of the lowering in a pre-RA pass. Keep the skip-jump
insertion late, along with a few other things that require more
work to move out.

One concern is that there may now be COPY instructions
which lack the necessary implicit exec uses
if they are later lowered to v_mov_b32.
This has a positive effect on SGPR usage in shader-db.
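To make the exec concern above concrete: a COPY of a divergent value that
is later expanded to v_mov_b32 implicitly reads exec, but the pre-RA COPY
does not record that use. A minimal C++ sketch (not part of this patch;
the helper name is hypothetical) of how such a use could be patched up
with the generic MachineInstr/MachineOperand API:

#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"

// Sketch: give a COPY that will become v_mov_b32 an implicit exec use, so
// exec liveness is modeled before register allocation. ExecReg is assumed
// to be AMDGPU::EXEC, supplied by the caller.
static void addImplicitExecUse(llvm::MachineInstr &MI, unsigned ExecReg) {
  if (!MI.isCopy())
    return;
  // readsRegister checks both explicit and implicit operands.
  if (MI.readsRegister(ExecReg))
    return;
  MI.addOperand(llvm::MachineOperand::CreateReg(ExecReg, /*isDef=*/false,
                                                /*isImp=*/true));
}

Whether such a fixup is actually needed depends on whether any pass
between the new lowering point and v_mov_b32 expansion inspects exec
liveness.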
Added:
llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/trunk/test/CodeGen/AMDGPU/else.ll
llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Mon Aug 22 14:33:16 2016
@@ -39,7 +39,6 @@ FunctionPass *createSILowerI1CopiesPass(
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
FunctionPass *createSIWholeQuadModePass();
-FunctionPass *createSILowerControlFlowPass();
FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIFixSGPRCopiesPass();
FunctionPass *createSIDebuggerInsertNopsPass();
@@ -69,8 +68,10 @@ void initializeSIWholeQuadModePass(PassR
extern char &SIWholeQuadModeID;
void initializeSILowerControlFlowPass(PassRegistry &);
-extern char &SILowerControlFlowPassID;
+extern char &SILowerControlFlowID;
+void initializeSIInsertSkipsPass(PassRegistry &);
+extern char &SIInsertSkipsPassID;
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca(const TargetMachine *TM = nullptr);
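This header change follows the standard LLVM pattern for passes that are
added by ID rather than through a create* factory: the pass defines a
static char ID, registers itself with INITIALIZE_PASS, and exports a
char& alias whose address TargetPassConfig can take. A minimal sketch of
that pattern, with hypothetical names:

// Umbrella header (sketch):
void initializeMyLoweringPass(PassRegistry &);
extern char &MyLoweringID;

// Pass implementation (sketch):
char MyLowering::ID = 0;
INITIALIZE_PASS(MyLowering, "my-lowering", "My lowering pass", false, false)
char &llvm::MyLoweringID = MyLowering::ID;

// Client code can then schedule it by address, e.g. addPass(&MyLoweringID).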
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Mon Aug 22 14:33:16 2016
@@ -80,6 +80,7 @@ extern "C" void LLVMInitializeAMDGPUTarg
initializeSIInsertWaitsPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
+ initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
}
@@ -532,13 +533,6 @@ bool GCNPassConfig::addGlobalInstruction
#endif
void GCNPassConfig::addPreRegAlloc() {
- // This needs to be run directly before register allocation because
- // earlier passes might recompute live intervals.
- // TODO: handle CodeGenOpt::None; fast RA ignores spill weights set by the pass
- if (getOptLevel() > CodeGenOpt::None) {
- insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
- }
-
if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
@@ -556,10 +550,22 @@ void GCNPassConfig::addPreRegAlloc() {
}
void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ // FIXME: We have to disable the verifier here because of PHIElimination +
+ // TwoAddressInstructions disabling it.
+ insertPass(&TwoAddressInstructionPassID, &SILowerControlFlowID, false);
+
TargetPassConfig::addFastRegAlloc(RegAllocPass);
}
void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ // This needs to be run directly before register allocation because earlier
+ // passes might recompute live intervals.
+ insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);
+
+ // TODO: It might be better to run this right after phi elimination, but for
+ // now that would require not running the verifier.
+ insertPass(&RenameIndependentSubregsID, &SILowerControlFlowID);
+
TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}
@@ -579,7 +585,7 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
- addPass(createSILowerControlFlowPass());
+ addPass(&SIInsertSkipsPassID);
addPass(createSIDebuggerInsertNopsPass());
}
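TargetPassConfig::insertPass(TargetID, InsertedID, VerifyAfter) schedules
the inserted pass to run immediately after the target pass; the explicit
false in the fast-RA hook above is what disables the machine verifier, as
the FIXME notes. A condensed sketch of the scheme, using hypothetical
MyPassConfig/MyLoweringID names:

void MyPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
  // Lower control flow after two-address conversion, without running the
  // verifier afterwards (PHIElimination/TwoAddressInstructions disable it).
  insertPass(&TwoAddressInstructionPassID, &MyLoweringID,
             /*VerifyAfter=*/false);
  TargetPassConfig::addFastRegAlloc(RegAllocPass);
}

void MyPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
  // In the optimized path the verifier can stay enabled (the default).
  insertPass(&RenameIndependentSubregsID, &MyLoweringID);
  TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}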
Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Mon Aug 22 14:33:16 2016
@@ -67,6 +67,7 @@ add_llvm_target(AMDGPUCodeGen
SIFixSGPRCopies.cpp
SIFoldOperands.cpp
SIFrameLowering.cpp
+ SIInsertSkips.cpp
SIInsertWaits.cpp
SIInstrInfo.cpp
SIISelLowering.cpp
Added: llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp?rev=279464&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertSkips.cpp Mon Aug 22 14:33:16 2016
@@ -0,0 +1,330 @@
+//===-- SIInsertSkips.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass inserts branches on the 0 exec mask over divergent
+/// branches when it's expected that jumping over the untaken control flow
+/// will be cheaper than having every workitem no-op through it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-skips"
+
+namespace {
+
+static cl::opt<unsigned> SkipThresholdFlag(
+ "amdgpu-skip-threshold",
+ cl::desc("Number of instructions before jumping over divergent control flow"),
+ cl::init(12), cl::Hidden);
+
+class SIInsertSkips : public MachineFunctionPass {
+private:
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+ unsigned SkipThreshold;
+
+ bool shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const;
+
+ bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+
+ void kill(MachineInstr &MI);
+
+ MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const;
+
+ bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
+
+public:
+ static char ID;
+
+ SIInsertSkips() :
+ MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const char *getPassName() const override {
+ return "SI insert s_cbranch_execz instructions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace
+
+char SIInsertSkips::ID = 0;
+
+INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+
+char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
+
+static bool opcodeEmitsNoInsts(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::BUNDLE:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
+ const MachineBasicBlock &To) const {
+ if (From.succ_empty())
+ return false;
+
+ unsigned NumInstr = 0;
+ const MachineFunction *MF = From.getParent();
+
+ for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+ MBBI != End && MBBI != ToI; ++MBBI) {
+ const MachineBasicBlock &MBB = *MBBI;
+
+ for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+ NumInstr < SkipThreshold && I != E; ++I) {
+ if (opcodeEmitsNoInsts(I->getOpcode()))
+ continue;
+
+ // FIXME: Since this is required for correctness, this should be inserted
+ // during SILowerControlFlow.
+
+ // When a uniform loop is inside non-uniform control flow, the branch
+ // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
+ // when EXEC = 0. We should skip the loop lest it become infinite.
+ if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
+ I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ return true;
+
+ if (I->isInlineAsm()) {
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ const char *AsmStr = I->getOperand(0).getSymbolName();
+
+ // The inline asm length estimate is a byte count, assuming the longest
+ // instruction.
+ uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
+ NumInstr += MaxAsmSize / MAI->getMaxInstLength();
+ } else {
+ ++NumInstr;
+ }
+
+ if (NumInstr >= SkipThreshold)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction *MF = MBB.getParent();
+
+ if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
+ !shouldSkip(MBB, MBB.getParent()->back()))
+ return false;
+
+ MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ // If the exec mask is non-zero, skip the next two instructions
+ BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&NextBB);
+
+ MachineBasicBlock::iterator Insert = SkipBB->begin();
+
+ // Exec mask is zero: Export to NULL target...
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
+ .addImm(0)
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addImm(0)
+ .addImm(1)
+ .addImm(1)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef);
+
+ // ... and terminate wavefront.
+ BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+
+ return true;
+}
+
+void SIInsertSkips::kill(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MI.getDebugLoc();
+ const MachineOperand &Op = MI.getOperand(0);
+
+#ifndef NDEBUG
+ CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
+ // Kill is only allowed in pixel / geometry shaders.
+ assert(CallConv == CallingConv::AMDGPU_PS ||
+ CallConv == CallingConv::AMDGPU_GS);
+#endif
+ // Clear this thread from the exec mask if the operand is negative.
+ if (Op.isImm()) {
+ // Constant operand: Set exec mask to 0 or do nothing
+ if (Op.getImm() & 0x80000000) {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addImm(0);
+ }
+ } else {
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
+ .addImm(0)
+ .addOperand(Op);
+ }
+}
+
+MachineBasicBlock *SIInsertSkips::insertSkipBlock(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
+ MachineFunction *MF = MBB.getParent();
+
+ MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF->insert(MBBI, SkipBB);
+ MBB.addSuccessor(SkipBB);
+
+ return SkipBB;
+}
+
+// Returns true if a branch over the block was inserted.
+bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
+ MachineBasicBlock &SrcMBB) {
+ MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();
+
+ if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());
+
+ BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(DestBB);
+
+ return true;
+}
+
+bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ SkipThreshold = SkipThresholdFlag;
+
+ bool HaveKill = false;
+ bool MadeChange = false;
+
+ // Track depth of exec mask, divergent branches.
+ SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
+
+ MachineFunction::iterator NextBB;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
+ MachineBasicBlock &MBB = *BI;
+
+ if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
+ // Reached convergence point for last divergent branch.
+ ExecBranchStack.pop_back();
+ }
+
+ if (HaveKill && ExecBranchStack.empty()) {
+ HaveKill = false;
+
+ // TODO: Insert skip if exec is 0?
+ }
+
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+
+ MachineInstr &MI = *I;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_MASK_BRANCH: {
+ ExecBranchStack.push_back(MI.getOperand(0).getMBB());
+ MadeChange |= skipMaskBranch(MI, MBB);
+ break;
+ }
+ case AMDGPU::S_BRANCH: {
+ // Optimize out branches to the next block.
+ // FIXME: Shouldn't this be handled by BranchFolding?
+ if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB()))
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_KILL_TERMINATOR: {
+ MadeChange = true;
+ kill(MI);
+
+ if (ExecBranchStack.empty()) {
+ if (skipIfDead(MI, *NextBB)) {
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+ } else {
+ HaveKill = true;
+ }
+
+ MI.eraseFromParent();
+ break;
+ }
+ case AMDGPU::SI_RETURN: {
+ // FIXME: Should move somewhere else
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
+ // SI_RETURN is not the last instruction. Add an empty block at
+ // the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ I->eraseFromParent();
+ }
+ }
+ default:
+ break;
+ }
+ }
+ }
+
+ return MadeChange;
+}
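shouldSkip above is a pure size heuristic: walk the layout-order blocks
between the branch and its target, estimate how many instructions inactive
lanes would have to no-op through (charging inline asm its byte-length
upper bound divided by the maximum instruction size), and insert the skip
branch only once the estimate reaches SkipThreshold. A standalone sketch
of the same idea with simplified types (plain size estimates instead of
MachineInstrs):

#include <vector>

// Each inner vector is one basic block; each entry is the size estimate
// for one instruction (1 for normal instructions, more for inline asm).
bool worthSkipping(const std::vector<std::vector<unsigned>> &Blocks,
                   unsigned Threshold) {
  unsigned NumInstr = 0;
  for (const auto &Block : Blocks) {
    for (unsigned Cost : Block) {
      NumInstr += Cost;
      if (NumInstr >= Threshold)
        return true; // Large region: a jump is cheaper than no-oping.
    }
  }
  return false; // Small region: let inactive lanes fall through it.
}

Note that the real pass also returns true early when it sees
S_CBRANCH_VCC(N)Z, since skipping is then required for correctness, not
just profitability.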
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon Aug 22 14:33:16 2016
@@ -1807,6 +1807,7 @@ def SI_MASK_BRANCH : PseudoInstSI <
let isTerminator = 1;
let isBarrier = 0;
let SALU = 1;
+ let Uses = [EXEC];
}
let Uses = [EXEC], Defs = [EXEC, SCC] in {
Modified: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp Mon Aug 22 14:33:16 2016
@@ -58,8 +58,6 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/MC/MCAsmInfo.h"
using namespace llvm;
@@ -67,46 +65,41 @@ using namespace llvm;
namespace {
-static cl::opt<unsigned> SkipThresholdFlag(
- "amdgpu-skip-threshold",
- cl::desc("Number of instructions before jumping over divergent control flow"),
- cl::init(12), cl::Hidden);
-
class SILowerControlFlow : public MachineFunctionPass {
private:
const SIRegisterInfo *TRI;
const SIInstrInfo *TII;
- unsigned SkipThreshold;
-
- bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
-
- MachineInstr *Skip(MachineInstr &From, MachineOperand &To);
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
+ LiveIntervals *LIS;
- void If(MachineInstr &MI);
- void Else(MachineInstr &MI);
- void Break(MachineInstr &MI);
- void IfBreak(MachineInstr &MI);
- void ElseBreak(MachineInstr &MI);
- void Loop(MachineInstr &MI);
- void EndCf(MachineInstr &MI);
+ void emitIf(MachineInstr &MI);
+ void emitElse(MachineInstr &MI);
+ void emitBreak(MachineInstr &MI);
+ void emitIfBreak(MachineInstr &MI);
+ void emitElseBreak(MachineInstr &MI);
+ void emitLoop(MachineInstr &MI);
+ void emitEndCf(MachineInstr &MI);
- void Kill(MachineInstr &MI);
- void Branch(MachineInstr &MI);
-
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
public:
static char ID;
SILowerControlFlow() :
- MachineFunctionPass(ID), TRI(nullptr), TII(nullptr), SkipThreshold(0) { }
+ MachineFunctionPass(ID),
+ TRI(nullptr),
+ TII(nullptr),
+ LIS(nullptr) {}
bool runOnMachineFunction(MachineFunction &MF) override;
const char *getPassName() const override {
return "SI Lower control flow pseudo instructions";
}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
};
} // End anonymous namespace
@@ -114,403 +107,236 @@ public:
char SILowerControlFlow::ID = 0;
INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
- "SI lower control flow", false, false)
+ "SI lower control flow", false, false)
-char &llvm::SILowerControlFlowPassID = SILowerControlFlow::ID;
+char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
+void SILowerControlFlow::emitIf(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator I(&MI);
-FunctionPass *llvm::createSILowerControlFlowPass() {
- return new SILowerControlFlow();
-}
+ MachineOperand &SaveExec = MI.getOperand(0);
+ MachineOperand &Cond = MI.getOperand(1);
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
+ Cond.getSubReg() == AMDGPU::NoSubRegister);
+
+ unsigned SaveExecReg = SaveExec.getReg();
+
+ MachineInstr *AndSaveExec =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExecReg)
+ .addOperand(Cond);
+
+ MachineInstr *Xor =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), SaveExecReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(SaveExecReg);
+
+ // Insert a pseudo terminator to help keep the verifier happy. This will also
+ // be used later when inserting skips.
+ MachineInstr *NewBr =
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addOperand(MI.getOperand(2))
+ .addReg(SaveExecReg, getKillRegState(SaveExec.isKill()));
-static bool opcodeEmitsNoInsts(unsigned Opc) {
- switch (Opc) {
- case TargetOpcode::IMPLICIT_DEF:
- case TargetOpcode::KILL:
- case TargetOpcode::BUNDLE:
- case TargetOpcode::CFI_INSTRUCTION:
- case TargetOpcode::EH_LABEL:
- case TargetOpcode::GC_LABEL:
- case TargetOpcode::DBG_VALUE:
- return true;
- default:
- return false;
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
}
-}
-bool SILowerControlFlow::shouldSkip(MachineBasicBlock *From,
- MachineBasicBlock *To) {
- if (From->succ_empty())
- return false;
-
- unsigned NumInstr = 0;
- MachineFunction *MF = From->getParent();
-
- for (MachineFunction::iterator MBBI(From), ToI(To), End = MF->end();
- MBBI != End && MBBI != ToI; ++MBBI) {
- MachineBasicBlock &MBB = *MBBI;
-
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
- NumInstr < SkipThreshold && I != E; ++I) {
- if (opcodeEmitsNoInsts(I->getOpcode()))
- continue;
-
- // When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
- return true;
-
- if (I->isInlineAsm()) {
- const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
- const char *AsmStr = I->getOperand(0).getSymbolName();
-
- // inlineasm length estimate is number of bytes assuming the longest
- // instruction.
- uint64_t MaxAsmSize = TII->getInlineAsmLength(AsmStr, *MAI);
- NumInstr += MaxAsmSize / MAI->getMaxInstLength();
- } else {
- ++NumInstr;
- }
- if (NumInstr >= SkipThreshold)
- return true;
- }
- }
+ LIS->ReplaceMachineInstrInMaps(MI, *AndSaveExec);
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*NewBr);
- return false;
-}
+ MI.eraseFromParent();
-MachineInstr *SILowerControlFlow::Skip(MachineInstr &From, MachineOperand &To) {
- if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
- return nullptr;
-
- const DebugLoc &DL = From.getDebugLoc();
- MachineInstr *Skip =
- BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
- .addOperand(To);
- return Skip;
+ // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
+ // hard to add another def here but I'm not sure how to correctly update the
+ // valno.
+ LIS->removeInterval(SaveExecReg);
+ LIS->createAndComputeVirtRegInterval(SaveExecReg);
}
-bool SILowerControlFlow::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
+void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(&MBB, &MBB.getParent()->back()))
- return false;
-
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
- MBB.addSuccessor(SkipBB);
-
const DebugLoc &DL = MI.getDebugLoc();
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
-
- MachineBasicBlock::iterator Insert = SkipBB->begin();
-
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP))
- .addImm(0)
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addImm(0)
- .addImm(1)
- .addImm(1)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef);
+ unsigned DstReg = MI.getOperand(0).getReg();
+ assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
+ bool ExecModified = MI.getOperand(3).getImm() != 0;
+ MachineBasicBlock::iterator Start = MBB.begin();
- return true;
-}
-
-void SILowerControlFlow::If(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
- .addReg(Vcc);
+ // This must be inserted before phis and any spill code inserted before the
+ // else.
+ MachineInstr *OrSaveExec =
+ BuildMI(MBB, Start, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), DstReg)
+ .addOperand(MI.getOperand(1)); // Saved EXEC
+ MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+
+ MachineBasicBlock::iterator ElsePt(MI);
+
+ if (ExecModified) {
+ MachineInstr *And =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_AND_B64), DstReg)
+ .addReg(AMDGPU::EXEC)
+ .addReg(DstReg);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
-
- MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*And);
+ }
- // Insert before the new branch instruction.
- MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
+ MachineInstr *Xor =
+ BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(DstReg);
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
// Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Reg);
+ MachineInstr *Branch =
+ BuildMI(MBB, Term, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
+ .addMBB(DestBB)
+ .addReg(DstReg);
- MI.eraseFromParent();
-}
-
-void SILowerControlFlow::Else(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
-
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
- .addReg(Src); // Saved EXEC
-
- if (MI.getOperand(3).getImm() != 0) {
- // Adjust the saved exec to account for the modifications during the flow
- // block that contains the ELSE. This can happen when WQM mode is switched
- // off.
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
+ if (!LIS) {
+ MI.eraseFromParent();
+ return;
}
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Dst);
-
- MachineInstr *SkipInst = Skip(MI, MI.getOperand(2));
-
- // Insert before the new branch instruction.
- MachineInstr *InsPt = SkipInst ? SkipInst : &MI;
-
- // Insert a pseudo terminator to help keep the verifier happy.
- BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_MASK_BRANCH))
- .addOperand(MI.getOperand(2))
- .addReg(Dst);
-
+ LIS->RemoveMachineInstrFromMaps(MI);
MI.eraseFromParent();
-}
-void SILowerControlFlow::Break(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
+ LIS->InsertMachineInstrInMaps(*OrSaveExec);
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src = MI.getOperand(1).getReg();
+ LIS->InsertMachineInstrInMaps(*Xor);
+ LIS->InsertMachineInstrInMaps(*Branch);
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
+ // src reg is tied to dst reg.
+ LIS->removeInterval(DstReg);
+ LIS->createAndComputeVirtRegInterval(DstReg);
- MI.eraseFromParent();
+ // Let this be recomputed.
+ LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
}
-void SILowerControlFlow::IfBreak(MachineInstr &MI) {
+void SILowerControlFlow::emitBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
+ const DebugLoc &DL = MI.getDebugLoc();
unsigned Dst = MI.getOperand(0).getReg();
- unsigned Vcc = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Vcc)
- .addReg(Src);
+ MachineInstr *Or =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(1));
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *Or);
MI.eraseFromParent();
}
-void SILowerControlFlow::ElseBreak(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
-
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Saved = MI.getOperand(1).getReg();
- unsigned Src = MI.getOperand(2).getReg();
-
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
- .addReg(Saved)
- .addReg(Src);
+void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
+}
- MI.eraseFromParent();
+void SILowerControlFlow::emitElseBreak(MachineInstr &MI) {
+ MI.setDesc(TII->get(AMDGPU::S_OR_B64));
}
-void SILowerControlFlow::Loop(MachineInstr &MI) {
+void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Src = MI.getOperand(0).getReg();
+ const DebugLoc &DL = MI.getDebugLoc();
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Src);
+ MachineInstr *AndN2 =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ MachineInstr *Branch =
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
.addOperand(MI.getOperand(1));
- MI.eraseFromParent();
-}
-
-void SILowerControlFlow::EndCf(MachineInstr &MI) {
- MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- unsigned Reg = MI.getOperand(0).getReg();
-
- BuildMI(MBB, MBB.getFirstNonPHI(), DL,
- TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(Reg);
+ if (LIS) {
+ LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
+ LIS->InsertMachineInstrInMaps(*Branch);
+ }
MI.eraseFromParent();
}
-void SILowerControlFlow::Branch(MachineInstr &MI) {
- MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
- if (MBB == MI.getParent()->getNextNode())
- MI.eraseFromParent();
-
- // If these aren't equal, this is probably an infinite loop.
-}
-
-void SILowerControlFlow::Kill(MachineInstr &MI) {
+void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
- DebugLoc DL = MI.getDebugLoc();
- const MachineOperand &Op = MI.getOperand(0);
-
-#ifndef NDEBUG
- CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv();
- // Kill is only allowed in pixel / geometry shaders.
- assert(CallConv == CallingConv::AMDGPU_PS ||
- CallConv == CallingConv::AMDGPU_GS);
-#endif
-
- // Clear this thread from the exec mask if the operand is negative
- if ((Op.isImm())) {
- // Constant operand: Set exec mask to 0 or do nothing
- if (Op.getImm() & 0x80000000) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addImm(0);
- }
- } else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32))
- .addImm(0)
- .addOperand(Op);
- }
-
- MI.eraseFromParent();
-}
+ const DebugLoc &DL = MI.getDebugLoc();
-MachineBasicBlock *SILowerControlFlow::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock::iterator InsPt = MBB.begin();
+ MachineInstr *NewMI =
+ BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addOperand(MI.getOperand(0));
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
- MF->insert(MBBI, SkipBB);
+ MI.eraseFromParent();
- return SkipBB;
+ if (LIS)
+ LIS->handleMove(*NewMI);
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- SkipThreshold = SkipThresholdFlag;
- bool HaveKill = false;
- unsigned Depth = 0;
+ // This doesn't actually need LiveIntervals, but we can preserve them.
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
MachineFunction::iterator NextBB;
-
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- default: break;
- case AMDGPU::SI_IF:
- ++Depth;
- If(MI);
- break;
-
- case AMDGPU::SI_ELSE:
- Else(MI);
- break;
-
- case AMDGPU::SI_BREAK:
- Break(MI);
- break;
-
- case AMDGPU::SI_IF_BREAK:
- IfBreak(MI);
- break;
-
- case AMDGPU::SI_ELSE_BREAK:
- ElseBreak(MI);
- break;
-
- case AMDGPU::SI_LOOP:
- ++Depth;
- Loop(MI);
- break;
-
- case AMDGPU::SI_END_CF:
- if (--Depth == 0 && HaveKill) {
- HaveKill = false;
- // TODO: Insert skip if exec is 0?
- }
-
- EndCf(MI);
- break;
-
- case AMDGPU::SI_KILL_TERMINATOR:
- if (Depth == 0) {
- if (skipIfDead(MI, *NextBB)) {
- NextBB = std::next(BI);
- BE = MF.end();
- }
- } else
- HaveKill = true;
- Kill(MI);
- break;
-
- case AMDGPU::S_BRANCH:
- Branch(MI);
- break;
-
- case AMDGPU::SI_RETURN: {
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN is not the last instruction. Add an empty block at
- // the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
- }
- break;
- }
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_BREAK:
+ emitBreak(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_ELSE_BREAK:
+ emitElseBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ default:
+ break;
}
}
}
+
return true;
}
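Most of the new emit* helpers share one LiveIntervals bookkeeping pattern:
let the first real instruction take over the pseudo's slot index, insert
the rest into the maps, then recompute any virtual register interval whose
defs changed. A condensed sketch of that pattern, assuming the standard
LiveIntervals API (hypothetical helper; DefReg must be a virtual register):

#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineInstr.h"

// Sketch: keep LiveIntervals consistent when Pseudo is expanded into
// First + Second. LIS may be null when the analysis is unavailable.
static void expandAndUpdateLIS(llvm::LiveIntervals *LIS,
                               llvm::MachineInstr &Pseudo,
                               llvm::MachineInstr &First,
                               llvm::MachineInstr &Second,
                               unsigned DefReg) {
  if (!LIS) {
    Pseudo.eraseFromParent();
    return;
  }
  // The pseudo's slot index is taken over by the first real instruction.
  LIS->ReplaceMachineInstrInMaps(Pseudo, First);
  LIS->InsertMachineInstrInMaps(Second);
  Pseudo.eraseFromParent();
  // DefReg's def structure changed, so rebuild its interval from scratch.
  LIS->removeInterval(DefReg);
  LIS->createAndComputeVirtRegInterval(DefReg);
}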
Modified: llvm/trunk/test/CodeGen/AMDGPU/else.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/else.ll?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/else.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/else.ll Mon Aug 22 14:33:16 2016
@@ -25,11 +25,13 @@ end:
}
; CHECK-LABEL: {{^}}else_execfix_leave_wqm:
+; CHECK: ; BB#0:
+; CHECK-NEXT: s_mov_b64 [[INIT_EXEC:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: ; %Flow
; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
-; CHECK-NEXT: s_and_b64 exec, exec,
-; CHECK-NEXT: s_and_b64 [[DST]], exec, [[DST]]
-; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
+; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
+; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
+; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: ; mask branch
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
main_body:
Modified: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll?rev=279464&r1=279463&r2=279464&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll Mon Aug 22 14:33:16 2016
@@ -2,11 +2,33 @@
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
-; SI-LABEL: @test_if
+; SI-LABEL: {{^}}test_if:
; Make sure the i1 values created by the cfg structurizer pass are
; moved using VALU instructions
+
+
+; waitcnt should be inserted after exec modification
+; SI: v_cmp_lt_i32_e32 vcc, 0,
+; SI-NEXT: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc
+; SI-NEXT: s_xor_b64 [[SAVE]], exec, [[SAVE]]
+; SI-NEXT: s_waitcnt lgkmcnt(0)
+; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
+; SI-NEXT: s_cbranch_execz [[FLOW_BB]]
+
+; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
; SI: v_mov_b32_e32 v{{[0-9]}}, -1
+; SI: s_and_saveexec_b64
+; SI-NEXT: s_xor_b64
+; SI-NEXT: ; mask branch
+
+; v_mov should be after exec modification
+; SI: [[FLOW_BB]]:
+; SI-NEXT: s_or_saveexec_b64 [[SAVE]], [[SAVE]]
+; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
+; SI-NEXT: s_xor_b64 exec, exec, [[SAVE]]
+; SI-NEXT: ; mask branch
+;
define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -17,12 +39,12 @@ entry:
case0:
%arrayidx1 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 0, i32 addrspace(1)* %arrayidx1, align 4
+ store i32 13, i32 addrspace(1)* %arrayidx1, align 4
br label %end
case1:
%arrayidx5 = getelementptr i32, i32 addrspace(1)* %dst, i32 %b
- store i32 1, i32 addrspace(1)* %arrayidx5, align 4
+ store i32 17, i32 addrspace(1)* %arrayidx5, align 4
br label %end
default:
@@ -31,11 +53,11 @@ default:
br i1 %cmp8, label %if, label %else
if:
- store i32 2, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 19, i32 addrspace(1)* %arrayidx10, align 4
br label %end
else:
- store i32 3, i32 addrspace(1)* %arrayidx10, align 4
+ store i32 21, i32 addrspace(1)* %arrayidx10, align 4
br label %end
end:
@@ -139,10 +161,11 @@ exit:
; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
; SI: [[LABEL_FLOW]]:
-; SI: s_or_b64 exec, exec, [[ORNEG2]]
-; SI: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
-; SI: s_andn2_b64 exec, exec, [[COND_STATE]]
-; SI: s_cbranch_execnz [[LABEL_LOOP]]
+; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
+; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
+; SI-NEXT: s_or_b64 [[COND_STATE]], [[ORNEG2]], [[TMP]]
+; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]
+; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]
; SI: BB#5
; SI: s_or_b64 exec, exec, [[COND_STATE]]