[llvm] r273467 - AMDGPU: Fix verifier errors in SILowerControlFlow
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 22 13:15:28 PDT 2016
Author: arsenm
Date: Wed Jun 22 15:15:28 2016
New Revision: 273467
URL: http://llvm.org/viewvc/llvm-project?rev=273467&view=rev
Log:
AMDGPU: Fix verifier errors in SILowerControlFlow
The main sin this was committing was using terminator
instructions in the middle of the block, and then
not updating the block successors / predecessors.
Split the blocks up to avoid this and introduce new
pseudo instructions for branches taken with exec masking.
Also use a pseudo instead of emitting s_endpgm and erasing
it in the special case of a non-void return.
Modified:
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/R600Instructions.td
llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll
llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp Wed Jun 22 15:15:28 2016
@@ -652,7 +652,7 @@ AMDGPUTargetLowering::LowerReturn(SDValu
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain);
+ return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
}
//===---------------------------------------------------------------------===//
@@ -2722,10 +2722,11 @@ const char* AMDGPUTargetLowering::getTar
// AMDIL DAG nodes
NODE_NAME_CASE(CALL);
NODE_NAME_CASE(UMUL);
- NODE_NAME_CASE(RET_FLAG);
NODE_NAME_CASE(BRANCH_COND);
// AMDGPU DAG nodes
+ NODE_NAME_CASE(ENDPGM)
+ NODE_NAME_CASE(RETURN)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(CLAMP)
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h Wed Jun 22 15:15:28 2016
@@ -219,9 +219,10 @@ enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
CALL, // Function call based on a single integer
UMUL, // 32bit unsigned multiplication
- RET_FLAG,
BRANCH_COND,
// End AMDIL ISD Opcodes
+ ENDPGM,
+ RETURN,
DWORDADDR,
FRACT,
CLAMP,
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUInstrInfo.td Wed Jun 22 15:15:28 2016
@@ -261,5 +261,8 @@ def IL_brcond : SDNode<"AMDGPUISD::
//===----------------------------------------------------------------------===//
// Call/Return DAG Nodes
//===----------------------------------------------------------------------===//
-def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue]>;
+
+def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp Wed Jun 22 15:15:28 2016
@@ -17,7 +17,6 @@
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUTargetMachine.h"
#include "InstPrinter/AMDGPUInstPrinter.h"
-#include "R600InstrInfo.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -107,6 +106,29 @@ void AMDGPUAsmPrinter::EmitInstruction(c
++I;
}
} else {
+ // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
+ // terminator instructions and should only be printed as comments.
+ if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
+ if (isVerbose()) {
+ SmallVector<char, 16> BBStr;
+ raw_svector_ostream Str(BBStr);
+
+ const MachineBasicBlock *MBB = MI->getOperand(1).getMBB();
+ const MCSymbolRefExpr *Expr
+ = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ Expr->print(Str, MAI);
+ OutStreamer->emitRawComment(" mask branch " + BBStr);
+ }
+
+ return;
+ }
+
+ if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+ if (isVerbose())
+ OutStreamer->emitRawComment(" return");
+ return;
+ }
+
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Wed Jun 22 15:15:28 2016
@@ -448,8 +448,8 @@ void GCNPassConfig::addPreEmitPass() {
addPass(createSIInsertWaitsPass());
addPass(createSIShrinkInstructionsPass());
- addPass(createSILowerControlFlowPass(), false);
- addPass(createSIDebuggerInsertNopsPass(), false);
+ addPass(createSILowerControlFlowPass());
+ addPass(createSIDebuggerInsertNopsPass());
}
TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
Modified: llvm/trunk/lib/Target/AMDGPU/R600Instructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/R600Instructions.td?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/R600Instructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/R600Instructions.td Wed Jun 22 15:15:28 2016
@@ -1539,8 +1539,9 @@ let isTerminator = 1, usesCustomInserter
//===---------------------------------------------------------------------===//
let isTerminator = 1, isReturn = 1, hasCtrlDep = 1,
usesCustomInserter = 1 in {
- def RETURN : ILFormat<(outs), (ins variable_ops),
- "RETURN", [(IL_retflag)]>;
+ def RETURN : ILFormat<(outs), (ins variable_ops),
+ "RETURN", [(AMDGPUendpgm)]
+ >;
}
//===----------------------------------------------------------------------===//
Modified: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp Wed Jun 22 15:15:28 2016
@@ -1008,7 +1008,8 @@ SITargetLowering::LowerReturn(SDValue Ch
if (Flag.getNode())
RetOps.push_back(Flag);
- return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+ unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+ return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
@@ -1469,8 +1470,8 @@ SDValue SITargetLowering::lowerTRAP(SDVa
// FIXME: This should really be selected to s_trap, but that requires
// setting up the trap handler for it o do anything.
- return DAG.getNode(AMDGPUISD::RET_FLAG, SDLoc(Op), MVT::Other, Op.
- getOperand(0));
+ return DAG.getNode(AMDGPUISD::ENDPGM, SDLoc(Op), MVT::Other,
+ Op.getOperand(0));
}
SDValue SITargetLowering::copyToM0(SelectionDAG &DAG, SDValue Chain,
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td Wed Jun 22 15:15:28 2016
@@ -11,8 +11,9 @@
//
//===----------------------------------------------------------------------===//
-class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
- AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
+class InstSI <dag outs, dag ins, string asm = "",
+ list<dag> pattern = []> :
+ AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
field bits<1> VM_CNT = 0;
field bits<1> EXP_CNT = 0;
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Wed Jun 22 15:15:28 2016
@@ -426,7 +426,7 @@ def S_NOP : SOPP <0x00000000, (ins i16im
let isTerminator = 1 in {
def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
- [(IL_retflag)]> {
+ [(AMDGPUendpgm)]> {
let simm16 = 0;
let isBarrier = 1;
let hasCtrlDep = 1;
@@ -1908,7 +1908,7 @@ def V_MOV_B64_PSEUDO : InstSI <(outs VRe
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0
let hasSideEffects = 1, SALU = 1 in {
-def SGPR_USE : InstSI <(outs),(ins), "", []>;
+def SGPR_USE : InstSI <(outs), (ins)>;
}
let usesCustomInserter = 1, SALU = 1 in {
@@ -1919,61 +1919,57 @@ def GET_GROUPSTATICSIZE : InstSI <(outs
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in {
+let hasSideEffects = 1, isPseudo = 1, isCodeGenOnly = 1 in {
+
+// Dummy terminator instruction to use after control flow instructions
+// replaced with exec mask operations.
+def SI_MASK_BRANCH : InstSI <
+ (outs SReg_64:$dst), (ins brtarget:$target)> {
+ let isBranch = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let SALU = 1;
+}
+
let Uses = [EXEC], Defs = [EXEC] in {
let isBranch = 1, isTerminator = 1 in {
def SI_IF: InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, brtarget:$target),
- "",
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), "",
[(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))]
>;
def SI_ELSE : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src, brtarget:$target),
- "",
- [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]
-> {
+ (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target), "",
+ [(set i64:$dst, (int_amdgcn_else i64:$src, bb:$target))]> {
let Constraints = "$src = $dst";
}
def SI_LOOP : InstSI <
- (outs),
- (ins SReg_64:$saved, brtarget:$target),
- "si_loop $saved, $target",
+ (outs), (ins SReg_64:$saved, brtarget:$target), "",
[(int_amdgcn_loop i64:$saved, bb:$target)]
>;
} // End isBranch = 1, isTerminator = 1
def SI_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src),
- "si_else $dst, $src",
+ (outs SReg_64:$dst), (ins SReg_64:$src), "",
[(set i64:$dst, (int_amdgcn_break i64:$src))]
>;
def SI_IF_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$vcc, SReg_64:$src),
- "si_if_break $dst, $vcc, $src",
+ (outs SReg_64:$dst), (ins SReg_64:$vcc, SReg_64:$src), "",
[(set i64:$dst, (int_amdgcn_if_break i1:$vcc, i64:$src))]
>;
def SI_ELSE_BREAK : InstSI <
- (outs SReg_64:$dst),
- (ins SReg_64:$src0, SReg_64:$src1),
- "si_else_break $dst, $src0, $src1",
+ (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), "",
[(set i64:$dst, (int_amdgcn_else_break i64:$src0, i64:$src1))]
>;
def SI_END_CF : InstSI <
- (outs),
- (ins SReg_64:$saved),
- "si_end_cf $saved",
+ (outs), (ins SReg_64:$saved), "",
[(int_amdgcn_end_cf i64:$saved)]
>;
@@ -1981,30 +1977,24 @@ def SI_END_CF : InstSI <
let Uses = [EXEC], Defs = [EXEC,VCC] in {
def SI_KILL : InstSI <
- (outs),
- (ins VSrc_32:$src),
- "si_kill $src",
+ (outs), (ins VSrc_32:$src), "",
[(int_AMDGPU_kill f32:$src)]
>;
} // End Uses = [EXEC], Defs = [EXEC,VCC]
} // End mayLoad = 1, mayStore = 1, hasSideEffects = 1
-let SALU = 1 in
def SI_PS_LIVE : InstSI <
- (outs SReg_64:$dst),
- (ins),
- "si_ps_live $dst",
- [(set i1:$dst, (int_amdgcn_ps_live))]
->;
+ (outs SReg_64:$dst), (ins), "",
+ [(set i1:$dst, (int_amdgcn_ps_live))]> {
+ let SALU = 1;
+}
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
// fold operands before it runs.
-def SI_INIT_M0 : InstSI <
- (outs),
- (ins SSrc_32:$src), "", []> {
+def SI_INIT_M0 : InstSI <(outs), (ins SSrc_32:$src)> {
let Defs = [M0];
let usesCustomInserter = 1;
let isPseudo = 1;
@@ -2014,21 +2004,28 @@ def SI_INIT_M0 : InstSI <
let isReMaterializable = 1;
}
+def SI_RETURN : InstSI <
+ (outs), (ins variable_ops), "", [(AMDGPUreturn)]> {
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let isPseudo = 1;
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 1;
+ let SALU = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
let Uses = [EXEC], Defs = [EXEC, VCC, M0] in {
class SI_INDIRECT_SRC<RegisterClass rc> : InstSI <
(outs VGPR_32:$dst, SReg_64:$temp),
- (ins rc:$src, VSrc_32:$idx, i32imm:$off),
- "si_indirect_src $dst, $temp, $src, $idx, $off",
- []
+ (ins rc:$src, VSrc_32:$idx, i32imm:$off)
>;
class SI_INDIRECT_DST<RegisterClass rc> : InstSI <
(outs rc:$dst, SReg_64:$temp),
- (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val),
- "si_indirect_dst $dst, $temp, $src, $idx, $off, $val",
- []
-> {
+ (ins unknown:$src, VSrc_32:$idx, i32imm:$off, VGPR_32:$val)> {
let Constraints = "$src = $dst";
}
@@ -2052,16 +2049,14 @@ multiclass SI_SPILL_SGPR <RegisterClass
let UseNamedOperandTable = 1, Uses = [EXEC] in {
def _SAVE : InstSI <
(outs),
- (ins sgpr_class:$src, i32imm:$frame_idx),
- "", []> {
+ (ins sgpr_class:$src, i32imm:$frame_idx)> {
let mayStore = 1;
let mayLoad = 0;
}
def _RESTORE : InstSI <
(outs sgpr_class:$dst),
- (ins i32imm:$frame_idx),
- "", []> {
+ (ins i32imm:$frame_idx)> {
let mayStore = 0;
let mayLoad = 1;
}
@@ -2082,8 +2077,7 @@ multiclass SI_SPILL_VGPR <RegisterClass
def _SAVE : InstSI <
(outs),
(ins vgpr_class:$src, i32imm:$frame_idx, SReg_128:$scratch_rsrc,
- SReg_32:$scratch_offset, i32imm:$offset),
- "", []> {
+ SReg_32:$scratch_offset, i32imm:$offset)> {
let mayStore = 1;
let mayLoad = 0;
}
@@ -2091,8 +2085,7 @@ multiclass SI_SPILL_VGPR <RegisterClass
def _RESTORE : InstSI <
(outs vgpr_class:$dst),
(ins i32imm:$frame_idx, SReg_128:$scratch_rsrc, SReg_32:$scratch_offset,
- i32imm:$offset),
- "", []> {
+ i32imm:$offset)> {
let mayStore = 0;
let mayLoad = 1;
}
Modified: llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerControlFlow.cpp Wed Jun 22 15:15:28 2016
@@ -88,10 +88,14 @@ private:
void Kill(MachineInstr &MI);
void Branch(MachineInstr &MI);
- void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
+ void emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB, DebugLoc DL,
+ MachineInstr *MovRel,
+ unsigned SaveReg, unsigned IdxReg, int Offset);
+
+ bool loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg, int &Offset);
- void IndirectSrc(MachineInstr &MI);
- void IndirectDst(MachineInstr &MI);
+ bool indirectSrc(MachineInstr &MI);
+ bool indirectDst(MachineInstr &MI);
public:
static char ID;
@@ -104,11 +108,6 @@ public:
const char *getPassName() const override {
return "SI Lower control flow pseudo instructions";
}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
};
} // End anonymous namespace
@@ -227,6 +226,10 @@ void SILowerControlFlow::If(MachineInstr
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Reg)
+ .addOperand(MI.getOperand(2));
+
MI.eraseFromParent();
}
@@ -255,6 +258,10 @@ void SILowerControlFlow::Else(MachineIns
Skip(MI, MI.getOperand(2));
+ // Insert a pseudo terminator to help keep the verifier happy.
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_MASK_BRANCH), Dst)
+ .addOperand(MI.getOperand(2));
+
MI.eraseFromParent();
}
@@ -331,7 +338,8 @@ void SILowerControlFlow::EndCf(MachineIn
}
void SILowerControlFlow::Branch(MachineInstr &MI) {
- if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
+ MachineBasicBlock *MBB = MI.getOperand(0).getMBB();
+ if (MBB == MI.getParent()->getNextNode())
MI.eraseFromParent();
// If these aren't equal, this is probably an infinite loop.
@@ -365,75 +373,109 @@ void SILowerControlFlow::Kill(MachineIns
MI.eraseFromParent();
}
-void SILowerControlFlow::LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
+void SILowerControlFlow::emitLoadM0FromVGPRLoop(MachineBasicBlock &LoopBB,
+ DebugLoc DL,
+ MachineInstr *MovRel,
+ unsigned SaveReg,
+ unsigned IdxReg,
+ int Offset) {
+ MachineBasicBlock::iterator I = LoopBB.begin();
+
+ // Read the next variant into VCC (lower 32 bits) <- also loop target
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), AMDGPU::VCC_LO)
+ .addReg(IdxReg);
+
+ // Move index from VCC into M0
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(AMDGPU::VCC_LO);
+
+ // Compare the just read M0 value to all possible Idx values
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
+ .addReg(AMDGPU::M0)
+ .addReg(IdxReg);
+
+ // Update EXEC, save the original EXEC value to VCC
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
+ .addReg(AMDGPU::VCC);
+
+ if (Offset) {
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(AMDGPU::M0)
+ .addImm(Offset);
+ }
+
+ // Do the actual move
+ LoopBB.insert(I, MovRel);
+
+ // Update EXEC, switch all done bits to 0 and all todo bits to 1
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
+ .addReg(AMDGPU::EXEC)
+ .addReg(AMDGPU::VCC);
+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
+ BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(&LoopBB);
+}
+
+// Returns true if a new block was inserted.
+bool SILowerControlFlow::loadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
MachineBasicBlock::iterator I = MI;
- unsigned Save = MI.getOperand(1).getReg();
unsigned Idx = MI.getOperand(3).getReg();
if (AMDGPU::SReg_32RegClass.contains(Idx)) {
if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(Idx)
- .addImm(Offset);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
+ .addReg(Idx)
+ .addImm(Offset);
} else {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(Idx);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addReg(Idx);
}
- MBB.insert(I, MovRel);
- } else {
-
- assert(AMDGPU::SReg_64RegClass.contains(Save));
- assert(AMDGPU::VGPR_32RegClass.contains(Idx));
-
- // Save the EXEC mask
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
- .addReg(AMDGPU::EXEC);
-
- // Read the next variant into VCC (lower 32 bits) <- also loop target
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- AMDGPU::VCC_LO)
- .addReg(Idx);
-
- // Move index from VCC into M0
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
- .addReg(AMDGPU::VCC_LO);
-
- // Compare the just read M0 value to all possible Idx values
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32))
- .addReg(AMDGPU::M0)
- .addReg(Idx);
- // Update EXEC, save the original EXEC value to VCC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
- .addReg(AMDGPU::VCC);
-
- if (Offset) {
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
- .addReg(AMDGPU::M0)
- .addImm(Offset);
- }
- // Do the actual move
MBB.insert(I, MovRel);
+ MI.eraseFromParent();
+ return false;
+ }
- // Update EXEC, switch all done bits to 0 and all todo bits to 1
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
- .addReg(AMDGPU::VCC);
+ MachineFunction &MF = *MBB.getParent();
+ unsigned Save = MI.getOperand(1).getReg();
- // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addImm(-7);
-
- // Restore EXEC
- BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
- .addReg(Save);
+ // Reading from a VGPR requires looping over all workitems in the wavefront.
+ assert(AMDGPU::SReg_64RegClass.contains(Save) &&
+ AMDGPU::VGPR_32RegClass.contains(Idx));
+
+ // Save the EXEC mask
+ BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
+ .addReg(AMDGPU::EXEC);
+
+ // To insert the loop we need to split the block. Move everything after this
+ // point to a new block, and insert a new empty block between the two.
+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(MBB);
+ ++MBBI;
+
+ MF.insert(MBBI, LoopBB);
+ MF.insert(MBBI, RemainderBB);
+
+ LoopBB->addSuccessor(LoopBB);
+ LoopBB->addSuccessor(RemainderBB);
+
+ // Move the rest of the block into a new block.
+ RemainderBB->transferSuccessors(&MBB);
+ RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+
+ emitLoadM0FromVGPRLoop(*LoopBB, DL, MovRel, Save, Idx, Offset);
+
+ MachineBasicBlock::iterator First = RemainderBB->begin();
+ BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+ .addReg(Save);
- }
MI.eraseFromParent();
+ return true;
}
/// \param @VecReg The register which holds element zero of the vector
@@ -463,8 +505,8 @@ void SILowerControlFlow::computeIndirect
Reg = RC->getRegister(RegIdx);
}
-void SILowerControlFlow::IndirectSrc(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectSrc(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
@@ -480,11 +522,11 @@ void SILowerControlFlow::IndirectSrc(Mac
.addReg(Reg)
.addReg(Vec, RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Off);
}
-void SILowerControlFlow::IndirectDst(MachineInstr &MI) {
-
+// Return true if a new block was inserted.
+bool SILowerControlFlow::indirectDst(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
@@ -501,7 +543,7 @@ void SILowerControlFlow::IndirectDst(Mac
.addReg(Val)
.addReg(Dst, RegState::Implicit);
- LoadM0(MI, MovRel, Off);
+ return loadM0(MI, MovRel, Off);
}
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
@@ -514,11 +556,14 @@ bool SILowerControlFlow::runOnMachineFun
bool NeedFlat = false;
unsigned Depth = 0;
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; ++BI) {
+ MachineFunction::iterator NextBB;
- MachineBasicBlock *EmptyMBBAtEnd = NULL;
+ for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+ BI != BE; BI = NextBB) {
+ NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
+
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
MachineBasicBlock::iterator I, Next;
bool ExecModified = false;
@@ -591,7 +636,15 @@ bool SILowerControlFlow::runOnMachineFun
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
- IndirectSrc(MI);
+ if (indirectSrc(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
case AMDGPU::SI_INDIRECT_DST_V1:
@@ -599,7 +652,15 @@ bool SILowerControlFlow::runOnMachineFun
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
- IndirectDst(MI);
+ if (indirectDst(MI)) {
+ // The block was split at this point. We can safely skip the middle
+ // inserted block to the following which contains the rest of this
+ // block's instructions.
+ NextBB = std::next(BI);
+ BE = MF.end();
+ Next = MBB.end();
+ }
+
break;
case AMDGPU::S_ENDPGM: {
Modified: llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si.ll Wed Jun 22 15:15:28 2016
@@ -174,6 +174,213 @@ entry:
ret void
}
+; When the block is split to insert the loop, make sure any other
+; places that need to be expanded in the same block are also handled.
+
+; CHECK-LABEL: {{^}}extract_vgpr_offset_multiple_in_block:
+
+; CHECK: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: s_mov_b32 [[S_ELT0:s[0-9]+]], 7
+; CHECK-DAG: s_mov_b32 [[S_ELT1:s[0-9]+]], 9
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], [[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT1:v[0-9]+]], [[S_ELT1]]
+; CHECK: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL0:v[0-9]+]], [[VEC_ELT0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movrels_b32_e32 [[MOVREL1:v[0-9]+]], [[VEC_ELT1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dword [[MOVREL0]]
+; CHECK: buffer_store_dword [[MOVREL1]]
+define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %val0 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx0
+ %val1 = extractelement <4 x i32> <i32 7, i32 9, i32 11, i32 13>, i32 %idx1
+ store volatile i32 %val0, i32 addrspace(1)* %out0
+ store volatile i32 %val1, i32 addrspace(1)* %out0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_vgpr_offset_multiple_in_block:
+; CHECK-DAG: s_load_dwordx4 s{{\[}}[[S_ELT0:[0-9]+]]:[[S_ELT3:[0-9]+]]{{\]}}
+; CHECK-DAG: {{buffer|flat}}_load_dword [[IDX0:v[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[VEC_ELT0:v[0-9]+]], s[[S_ELT0]]
+; CHECK-DAG: v_mov_b32_e32 [[INS0:v[0-9]+]], 62
+; CHECK-DAG: s_waitcnt vmcnt(0)
+
+; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], exec
+
+; CHECK: [[LOOP0:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL0:[0-9]+]], [[INS0]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP0]]
+
+; FIXME: Redundant copy
+; CHECK: s_mov_b64 exec, [[MASK]]
+; CHECK: v_mov_b32_e32 [[INS1:v[0-9]+]], 63
+; CHECK: s_mov_b64 [[MASK]], exec
+
+; CHECK: [[LOOP1:BB[0-9]+_[0-9]+]]:
+; CHECK: v_readfirstlane_b32 vcc_lo, [[IDX0]]
+; CHECK: s_mov_b32 m0, vcc_lo
+; CHECK: v_cmp_eq_u32_e32 vcc, m0, [[IDX0]]
+; CHECK: s_and_saveexec_b64 vcc, vcc
+; CHECK-NEXT: v_movreld_b32_e32 v[[MOVREL1:[0-9]+]], [[INS1]]
+; CHECK-NEXT: s_xor_b64 exec, exec, vcc
+; CHECK: s_cbranch_execnz [[LOOP1]]
+
+; CHECK: buffer_store_dwordx4 v{{\[}}[[MOVREL0]]:
+define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+entry:
+ %id = call i32 @llvm.amdgcn.workitem.id.x() #1
+ %id.ext = zext i32 %id to i64
+ %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %id.ext
+ %idx0 = load volatile i32, i32 addrspace(1)* %gep
+ %idx1 = add i32 %idx0, 1
+ %vec1 = insertelement <4 x i32> %vec0, i32 62, i32 %idx0
+ %vec2 = insertelement <4 x i32> %vec1, i32 63, i32 %idx1
+ store volatile <4 x i32> %vec2, <4 x i32> addrspace(1)* %out0
+ ret void
+}
+
+; CHECK-LABEL: {{^}}extract_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movrels_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @extract_adjacent_blocks(i32 %arg) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1:
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = extractelement <4 x float> %tmp2, i32 undef
+ br label %bb7
+
+bb4:
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = extractelement <4 x float> %tmp5, i32 undef
+ br label %bb7
+
+bb7:
+ %tmp8 = phi float [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile float %tmp8, float addrspace(1)* undef
+ ret void
+}
+
+; CHECK-LABEL: {{^}}insert_adjacent_blocks:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK: s_cmp_lg_i32
+; CHECK: s_cbranch_scc0 [[BB4:BB[0-9]+_[0-9]+]]
+
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]
+
+; CHECK: [[BB4]]:
+; CHECK: buffer_load_dwordx4
+; CHECK: s_mov_b32 m0,
+; CHECK: v_movreld_b32_e32
+
+; CHECK: [[ENDBB]]:
+; CHECK: buffer_store_dword
+; CHECK: s_endpgm
+define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+bb:
+ %tmp = icmp eq i32 %arg, 0
+ br i1 %tmp, label %bb1, label %bb4
+
+bb1: ; preds = %bb
+ %tmp2 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp3 = insertelement <4 x float> %tmp2, float %val0, i32 undef
+ br label %bb7
+
+bb4: ; preds = %bb
+ %tmp5 = load volatile <4 x float>, <4 x float> addrspace(1)* undef
+ %tmp6 = insertelement <4 x float> %tmp5, float %val0, i32 undef
+ br label %bb7
+
+bb7: ; preds = %bb4, %bb1
+ %tmp8 = phi <4 x float> [ %tmp3, %bb1 ], [ %tmp6, %bb4 ]
+ store volatile <4 x float> %tmp8, <4 x float> addrspace(1)* undef
+ ret void
+}
+
+; FIXME: Should be able to fold zero input to movreld to inline imm?
+
+; CHECK-LABEL: {{^}}multi_same_block:
+; CHECK: s_load_dword [[ARG:s[0-9]+]]
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: s_add_i32 m0, [[ARG]], -16
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, [[ZERO]]
+
+; CHECK: s_add_i32 m0, [[ARG]], -14
+; CHECK: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+
+; CHECK: s_mov_b32 m0, -1
+; CHECK: ds_write_b32
+; CHECK: ds_write_b32
+; CHECK: s_endpgm
+define void @multi_same_block(i32 %arg) #0 {
+bb:
+ %tmp1 = add i32 %arg, -16
+ %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 0.000000e+00, i32 %tmp1
+ %tmp3 = add i32 %arg, -16
+ %tmp4 = insertelement <6 x float> <float 0x40311999A0000000, float 0x40321999A0000000, float 0x40331999A0000000, float 0x40341999A0000000, float 0x40351999A0000000, float 0x40361999A0000000>, float 0x3FB99999A0000000, i32 %tmp3
+ %tmp5 = bitcast <6 x float> %tmp2 to <6 x i32>
+ %tmp6 = extractelement <6 x i32> %tmp5, i32 1
+ %tmp7 = bitcast <6 x float> %tmp4 to <6 x i32>
+ %tmp8 = extractelement <6 x i32> %tmp7, i32 5
+ store volatile i32 %tmp6, i32 addrspace(3)* undef, align 4
+ store volatile i32 %tmp8, i32 addrspace(3)* undef, align 4
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
+attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll Wed Jun 22 15:15:28 2016
@@ -1,17 +1,22 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-target triple = "amdgcn--"
+; This should end with an no-op sequence of exec mask manipulations
+; Mask should be in original state after executed unreachable block
; GCN-LABEL: {{^}}main:
-; GCN: BB0_3:
-; GCN-NEXT: s_branch [[LASTBB:BB[0-9]*_[0-9]*]]
-; GCN-NEXT: BB0_
-; GCN: [[LASTBB]]
-; GCN-NEXT: .Lfunc_end0:
-; ModuleID = 'bugpoint-reduced-simplified.bc'
-target triple = "amdgcn--"
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: [[RET_BB]]:
+; GCN-NEXT: ; return
+
+; GCN-NEXT: [[UNREACHABLE_BB]]:
+; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
+; GCN-NEXT: .Lfunc_end0
define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
main_body:
%p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=273467&r1=273466&r2=273467&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Wed Jun 22 15:15:28 2016
@@ -122,9 +122,13 @@ END:
;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]]
;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]]
;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]]
-;CHECK-NEXT: %ELSE
-;CHECK: store
-;CHECK: %END
+;CHECK-NEXT: mask branch [[END_BB:BB[0-9]+_[0-9]+]]
+;CHECK-NEXT: ; BB#3: ; %ELSE
+;CHECK: store_dword
+;CHECK: [[END_BB]]: ; %END
+;CHECK: s_or_b64 exec, exec,
+;CHECK: v_mov_b32_e32 v0
+;CHECK: ; return
define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
main_body:
%cmp = icmp eq i32 %z, 0
More information about the llvm-commits
mailing list