[llvm] r298729 - AMDGPU: Unify divergent function exits.
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 24 12:52:06 PDT 2017
Author: arsenm
Date: Fri Mar 24 14:52:05 2017
New Revision: 298729
URL: http://llvm.org/viewvc/llvm-project?rev=298729&view=rev
Log:
AMDGPU: Unify divergent function exits.
StructurizeCFG can't handle cases where multiple
returns create regions with multiple exits.
Create a copy of UnifyFunctionExitNodes that only
unifies the exit nodes reached through divergent
branches, skipping exit nodes with uniform branch
sources.
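
For illustration only (this sketch is not part of the patch; the kernel
and block names are invented, modeled on the added
multi-divergent-exit-region.ll test): a function with two returns
reached through a divergent branch has both returns redirected to a
single new block, so the divergent region has one exit before
StructurizeCFG runs.

  ; Input sketch: two divergently reached exits.
  declare i32 @llvm.amdgcn.workitem.id.x()

  define amdgpu_kernel void @example() {
  entry:
    %id = call i32 @llvm.amdgcn.workitem.id.x()
    %cond = icmp eq i32 %id, 0        ; divergent condition
    br i1 %cond, label %exit0, label %exit1

  exit0:
    store volatile i32 9, i32 addrspace(1)* undef
    ret void

  exit1:
    store volatile i32 17, i32 addrspace(3)* undef
    ret void
  }

  ; After amdgpu-unify-divergent-exit-nodes (approximate output): exit0
  ; and exit1 instead end in "br label %UnifiedReturnBlock", and the
  ; pass appends:
  ;
  ; UnifiedReturnBlock:
  ;   ret void
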
Added:
llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
Modified:
llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll
llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll
llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
Modified: llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td (original)
+++ llvm/trunk/include/llvm/IR/IntrinsicsAMDGPU.td Fri Mar 24 14:52:05 2017
@@ -705,6 +705,9 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1
def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
+// Represent unreachable in a divergent region.
+def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;
+
// Emit 2.5 ulp, no denormal division. Should only be inserted by
// pass based on !fpmath metadata.
def int_amdgcn_fdiv_fast : Intrinsic<
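
For context (illustration only; this is pieced together from the new
pass below and the IR checks in the added
multi-divergent-exit-region.ll test, not an extra hunk of the patch):
when a divergently reached unreachable has to be merged with returning
blocks, the unreachable instruction is replaced with a call to this
intrinsic followed by a return, which later structurization turns into
a branch. The test expects, for example:

  exit1:
    store volatile i32 9, i32 addrspace(1)* undef
    call void @llvm.amdgcn.unreachable()
    br label %Flow2

During selection the call becomes the new SI_MASKED_UNREACHABLE pseudo
added further down, which only prints the "; divergent unreachable"
comment and has size 0.
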
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Fri Mar 24 14:52:05 2017
@@ -123,6 +123,9 @@ extern char &SIDebuggerInsertNopsID;
void initializeSIInsertWaitsPass(PassRegistry&);
extern char &SIInsertWaitsID;
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+extern char &AMDGPUUnifyDivergentExitNodesID;
+
ImmutablePass *createAMDGPUAAWrapperPass();
void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Fri Mar 24 14:52:05 2017
@@ -132,6 +132,7 @@ extern "C" void LLVMInitializeAMDGPUTarg
initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
}
@@ -673,6 +674,10 @@ bool GCNPassConfig::addPreISel() {
// supported.
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+
+ // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+ // regions formed by them.
+ addPass(&AMDGPUUnifyDivergentExitNodesID);
addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
addPass(createSinkingPass());
addPass(createSITypeRewriter());
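
The same IR-level ordering can be reproduced outside the codegen
pipeline; the added multi-divergent-exit-region.ll test drives it with
opt directly (quoted from that test's first RUN line):

  opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
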
Added: llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp?rev=298729&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp Fri Mar 24 14:52:05 2017
@@ -0,0 +1,225 @@
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, and unifying the uniformly reached exits into the same
+// return block as the divergent ones would inhibit the use of scalar
+// branching. StructurizeCFG also can't deal with the case where one branch
+// goes to a return and another to an unreachable; replace the unreachable
+// with a return in that case.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+ }
+
+ // We can preserve non-critical-edgeness when we unify function exit nodes
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+ "Unify divergent function exit nodes", false, false)
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+ // TODO: Preserve dominator tree.
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+
+ AU.addRequired<DivergenceAnalysis>();
+
+ // No divergent values are changed, only blocks and branch edges.
+ AU.addPreserved<DivergenceAnalysis>();
+
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+ FunctionPass::getAnalysisUsage(AU);
+
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+/// XXX - Is there a more efficient way to find this?
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+ BasicBlock &BB) {
+ SmallVector<BasicBlock *, 8> Stack;
+ SmallPtrSet<BasicBlock *, 8> Visited;
+
+ for (BasicBlock *Pred : predecessors(&BB))
+ Stack.push_back(Pred);
+
+ while (!Stack.empty()) {
+ BasicBlock *Top = Stack.pop_back_val();
+ if (!DA.isUniform(Top->getTerminator()))
+ return false;
+
+ for (BasicBlock *Pred : predecessors(Top)) {
+ if (Visited.insert(Pred).second)
+ Stack.push_back(Pred);
+ }
+ }
+
+ return true;
+}
+
+static BasicBlock *unifyReturnBlockSet(Function &F,
+ ArrayRef<BasicBlock *> ReturningBlocks,
+ const TargetTransformInfo &TTI,
+ StringRef Name) {
+ // Otherwise, we need to insert a new basic block into the function, add PHI
+ // nodes (if the function returns values), and convert all of the return
+ // instructions into unconditional branches.
+ //
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ //
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Cleanup possible branch to unconditional branch to the return.
+ SimplifyCFG(BB, TTI, 2);
+ }
+
+ return NewRetBlock;
+}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ if (PDT.getRoots().size() <= 1)
+ return false;
+
+ DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+ // Loop over all of the blocks in a function, tracking all of the blocks that
+ // return.
+ //
+ SmallVector<BasicBlock *, 4> ReturningBlocks;
+ SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+ for (BasicBlock *BB : PDT.getRoots()) {
+ if (isa<ReturnInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ ReturningBlocks.push_back(BB);
+ } else if (isa<UnreachableInst>(BB->getTerminator())) {
+ if (!isUniformlyReached(DA, *BB))
+ UnreachableBlocks.push_back(BB);
+ }
+ }
+
+ if (!UnreachableBlocks.empty()) {
+ BasicBlock *UnreachableBlock = nullptr;
+
+ if (UnreachableBlocks.size() == 1) {
+ UnreachableBlock = UnreachableBlocks.front();
+ } else {
+ UnreachableBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedUnreachableBlock", &F);
+ new UnreachableInst(F.getContext(), UnreachableBlock);
+
+ for (BasicBlock *BB : UnreachableBlocks) {
+ BB->getInstList().pop_back(); // Remove the unreachable inst.
+ BranchInst::Create(UnreachableBlock, BB);
+ }
+ }
+
+ if (!ReturningBlocks.empty()) {
+ // Don't create a new unreachable inst if we have a return. The
+ // structurizer/annotator can't handle the multiple exits
+
+ Type *RetTy = F.getReturnType();
+ Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+ UnreachableBlock->getInstList().pop_back(); // Remove the unreachable inst.
+
+ Function *UnreachableIntrin =
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+ // Insert a call to an intrinsic tracking that this is an unreachable
+ // point, in case we want to kill the active lanes or something later.
+ CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+ // Don't create a scalar trap. We would only want to trap if this code was
+ // really reached, but a scalar trap would happen even if no lanes
+ // actually reached here.
+ ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+ ReturningBlocks.push_back(UnreachableBlock);
+ }
+ }
+
+ // Now handle return blocks.
+ if (ReturningBlocks.empty())
+ return false; // No blocks return
+
+ if (ReturningBlocks.size() == 1)
+ return false; // Already has a single return block
+
+ const TargetTransformInfo &TTI
+ = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ return true;
+}
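
To make the mixed return-plus-unreachable handling above concrete (a
sketch only, with invented block names; the real patterns are checked
in the multi-divergent-exit-region.ll test added below): given one
divergent exit that returns and one that is unreachable, the
unreachable block is first rewritten into a returning block marked
with llvm.amdgcn.unreachable, and then both exits are unified as
usual.

  ; before
  divergent.ret:
    ret void

  divergent.dead:
    unreachable

  ; after (approximate)
  divergent.ret:
    br label %UnifiedReturnBlock

  divergent.dead:
    call void @llvm.amdgcn.unreachable()
    br label %UnifiedReturnBlock

  UnifiedReturnBlock:
    ret void

As the comment in runOnFunction notes, no scalar trap is emitted at the
marker because it would fire even when no lanes actually reach the
block.
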
Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Fri Mar 24 14:52:05 2017
@@ -58,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPUInstrInfo.cpp
AMDGPUPromoteAlloca.cpp
AMDGPURegisterInfo.cpp
+ AMDGPUUnifyDivergentExitNodes.cpp
GCNHazardRecognizer.cpp
GCNSchedStrategy.cpp
R600ClauseMergePass.cpp
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrFormats.td Fri Mar 24 14:52:05 2017
@@ -138,19 +138,19 @@ class InstSI <dag outs, dag ins, string
let AsmVariantName = AMDGPUAsmVariants.Default;
}
-class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : InstSI<outs, ins, "", pattern> {
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : InstSI<outs, ins, asm, pattern> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
-class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let SALU = 1;
}
-class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
- : PseudoInstSI<outs, ins, pattern> {
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+ : PseudoInstSI<outs, ins, pattern, asm> {
let VALU = 1;
let Uses = [EXEC];
}
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Mar 24 14:52:05 2017
@@ -3803,16 +3803,11 @@ unsigned SIInstrInfo::getInstSizeInBytes
if (DescSize != 0 && DescSize != 4)
return DescSize;
- if (Opc == AMDGPU::WAVE_BARRIER)
- return 0;
-
// 4-byte instructions may have a 32-bit literal encoded after them. Check
// operands that could ever be literals.
if (isVALU(MI) || isSALU(MI)) {
- if (isFixedSize(MI)) {
- assert(DescSize == 4);
+ if (isFixedSize(MI))
return DescSize;
- }
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
if (Src0Idx == -1)
@@ -3835,7 +3830,6 @@ unsigned SIInstrInfo::getInstSizeInBytes
return 4;
switch (Opc) {
- case AMDGPU::SI_MASK_BRANCH:
case TargetOpcode::IMPLICIT_DEF:
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Fri Mar 24 14:52:05 2017
@@ -152,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs),
let mayStore = 1;
let isBarrier = 1;
let isConvergent = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
// SI pseudo instructions. These are used by the CFG structurizer pass
@@ -159,14 +161,15 @@ def WAVE_BARRIER : SPseudoInstSI<(outs),
// Dummy terminator instruction to use after control flow instructions
// replaced with exec mask operations.
-def SI_MASK_BRANCH : PseudoInstSI <
+def SI_MASK_BRANCH : VPseudoInstSI <
(outs), (ins brtarget:$target)> {
let isBranch = 0;
let isTerminator = 1;
let isBarrier = 0;
- let Uses = [EXEC];
let SchedRW = [];
let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+ let Size = 0;
}
let isTerminator = 1 in {
@@ -260,6 +263,14 @@ def SI_PS_LIVE : PseudoInstSI <
let SALU = 1;
}
+def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
+ [(int_amdgcn_unreachable)],
+ "; divergent unreachable"> {
+ let Size = 0;
+ let hasNoSchedulingInfo = 1;
+ let FixedSize = 1;
+}
+
// Used as an isel pseudo to directly emit initialization with an
// s_mov_b32 rather than a copy of another initialized
// register. MachineCSE skips copies, and we don't want to have to
Modified: llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/branch-condition-and.ll Fri Mar 24 14:52:05 2017
@@ -15,12 +15,16 @@
; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
-;
-; TODO: The following sequence is a bug (missing s_endpgm)!
-;
-; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
-; GCN: [[BB]]:
-; GCN-NEXT: .Lfunc_end0:
+; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+
+; GCN-NEXT: [[BB5]]
+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
@@ -29,6 +33,7 @@ bb:
br i1 %tmp3, label %bb4, label %bb5
bb4: ; preds = %bb
+ store volatile i32 4, i32 addrspace(3)* undef
unreachable
bb5: ; preds = %bb
Added: llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll?rev=298729&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll Fri Mar 24 14:52:05 2017
@@ -0,0 +1,710 @@
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add extra verifier runs. There were some cases where invalid IR
+; was produced but happened to be fixed by the later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: LeafBlock:
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: br label %Flow{{$}}
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+
+
+; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
+; GCN: v_cmp_lt_i32_e32 vcc, 1
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+
+; GCN: ; %Flow1
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+
+; GCN: ; %exit1
+; GCN: ds_write_b32
+
+; GCN: %Flow2
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64
+; GCN-NEXT: s_xor_b64
+
+; GCN: ; %exit0
+; GCN: buffer_store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+
+
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: unreachable
+
+
+; FIXME: Probably should insert an s_endpgm anyway.
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
+; GCN: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .Lfunc_end
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: llvm.amdgcn.if
+; IR: br i1
+
+; IR: {{^}}Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: {{^}}LeafBlock:
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
+; IR: br label %Flow
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: {{^}}Flow1:
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %divergent.cond0 = icmp slt i32 %tmp16, 2
+ br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %tmp16, 1
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %uniform.cond0 = icmp eq i32 %arg3, 2
+ br i1 %uniform.cond0, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %arg3, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
+; IR: Flow2:
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+
+; IR: UnifiedReturnBlock:
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: ret float %UnifiedRetVal
+define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
+entry:
+ %Pivot = icmp slt i32 %vgpr, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %vgpr, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %vgpr, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
+
+; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
+; GCN: s_cmp_gt_i32 s0, 1
+; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
+
+; GCN: {{^}}[[FLOW]]:
+; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+
+; GCN: v_mov_b32_e32 v0, 2.0
+; GCN: s_or_b64 exec, exec
+; GCN: s_and_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 1.0
+
+; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+
+define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
+entry:
+ %uniform.cond = icmp slt i32 %sgpr, 2
+ br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %divergent.cond0 = icmp eq i32 %vgpr, 3
+ br i1 %divergent.cond0, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %divergent.cond1 = icmp eq i32 %vgpr, 7
+ br i1 %divergent.cond1, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store i32 9, i32 addrspace(1)* undef
+ ret float 1.0
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store i32 17, i32 addrspace(3)* undef
+ ret float 2.0
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; The non-uniformity of the branch to the exiting blocks requires
+; looking at transitive predecessors.
+
+; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
+
+; IR: exit0: ; preds = %Flow2
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+
+; IR: indirect.exit1:
+; IR: %load = load volatile i32, i32 addrspace(1)* undef
+; IR: store volatile i32 %load, i32 addrspace(1)* undef
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock: ; preds = %exit0, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ %Pivot = icmp slt i32 %tmp16, 2
+ br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+indirect.exit1:
+ %load = load volatile i32, i32 addrspace(1)* undef
+ store volatile i32 %load, i32 addrspace(1)* undef
+ br label %exit1
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_switch(
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+ %tmp1 = add i32 0, %tmp
+ %tmp2 = zext i32 %tmp1 to i64
+ %tmp3 = add i64 0, %tmp2
+ %tmp4 = shl i64 %tmp3, 32
+ %tmp5 = ashr exact i64 %tmp4, 32
+ %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+ %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+ %tmp8 = sext i32 %tmp7 to i64
+ %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+ %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+ %tmp13 = zext i32 %tmp10 to i64
+ %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+ %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+ switch i32 %tmp16, label %exit1
+ [ i32 1, label %LeafBlock
+ i32 2, label %LeafBlock1
+ i32 3, label %exit0 ]
+
+LeafBlock: ; preds = %entry
+ %SwitchLeaf = icmp eq i32 %tmp16, 1
+ br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1: ; preds = %entry
+ %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+ br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 17, i32 addrspace(3)* undef
+ ret void
+
+exit1: ; preds = %LeafBlock, %LeafBlock1
+ store volatile i32 9, i32 addrspace(1)* undef
+ unreachable
+}
+
+; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
+
+divergent.if:
+ %vgpr0 = load volatile float, float addrspace(1)* undef
+ %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
+ br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
+
+divergent.then:
+ %vgpr1 = load volatile float, float addrspace(1)* undef
+ %divergent.cond2 = fcmp olt float %vgpr1, 4.0
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
+
+divergent.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %divergent.ret0
+
+divergent.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
+; IR: Flow1: ; preds = %uniform.ret1, %uniform.multi.exit.region
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
+
+; IR: Flow: ; preds = %uniform.then, %uniform.if
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+
+; IR: UnifiedReturnBlock: ; preds = %Flow3, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: ret void
+define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
+entry:
+ %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+ %divergent.cond0 = icmp eq i32 %id.x, 0
+ br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
+
+uniform.multi.exit.region:
+ %uniform.cond0 = icmp eq i32 %arg0, 4
+ br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
+
+uniform.if:
+ %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond1 = icmp slt i32 %sgpr0, 1
+ br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
+
+uniform.then:
+ %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+ %uniform.cond2 = icmp sge i32 %sgpr1, 4
+ store volatile i32 33, i32 addrspace(1)* undef
+ br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
+
+uniform.endif:
+ store volatile i32 38, i32 addrspace(1)* undef
+ br label %uniform.ret0
+
+uniform.ret0:
+ store volatile i32 11, i32 addrspace(3)* undef
+ ret void
+
+uniform.ret1:
+ store volatile i32 42, i32 addrspace(3)* undef
+ ret void
+
+divergent.ret:
+ store volatile i32 9, i32 addrspace(1)* undef
+ ret void
+}
+
+; IR-LABEL: @multi_divergent_unreachable_exit(
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
+bb:
+ %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+ switch i32 %tmp, label %bb3 [
+ i32 2, label %bb1
+ i32 0, label %bb2
+ ]
+
+bb1: ; preds = %bb
+ unreachable
+
+bb2: ; preds = %bb
+ unreachable
+
+bb3: ; preds = %bb
+ switch i32 undef, label %bb5 [
+ i32 2, label %bb4
+ ]
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3
+ unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
Modified: llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/ret_jump.ll Fri Mar 24 14:52:05 2017
@@ -4,20 +4,78 @@
; This should end with a no-op sequence of exec mask manipulations.
; The mask should be in its original state after the unreachable block executes.
-; GCN-LABEL: {{^}}main:
+
+; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; %else
+
; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
-; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
-; GCN: [[RET_BB]]:
-; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN-NEXT: ; divergent unreachable
-; GCN-NEXT: [[UNREACHABLE_BB]]:
-; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec
+
+; GCN-NEXT: [[RET_BB]]:
+; GCN-NEXT: ; return
; GCN-NEXT: .Lfunc_end0
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+entry:
+ %i.i = extractelement <2 x i32> %arg7, i32 0
+ %j.i = extractelement <2 x i32> %arg7, i32 1
+ %i.f.i = bitcast i32 %i.i to float
+ %j.f.i = bitcast i32 %j.i to float
+ %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+ %p87 = fmul float undef, %p2.i
+ %p88 = fadd float %p87, undef
+ %p93 = fadd float %p88, undef
+ %p97 = fmul float %p93, undef
+ %p102 = fsub float %p97, undef
+ %p104 = fmul float %p102, undef
+ %p106 = fadd float 0.000000e+00, %p104
+ %p108 = fadd float undef, %p106
+ %uniform.cond = icmp slt i32 %arg17, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
+
+else: ; preds = %main_body
+ %p124 = fmul float %p108, %p108
+ %p125 = fsub float %p124, undef
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
+
+unreachable.bb: ; preds = %else
+ unreachable
+
+ret.bb: ; preds = %else, %main_body
+ ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %else
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: ; %unreachable.bb
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+; GCN: ; divergent unreachable
+
+; GCN: ; %ret.bb
+; GCN: store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+; GCN-NEXT: .Lfunc_end
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
main_body:
%i.i = extractelement <2 x i32> %arg7, i32 0
%j.i = extractelement <2 x i32> %arg7, i32 1
@@ -33,18 +91,21 @@ main_body:
%p104 = fmul float %p102, undef
%p106 = fadd float 0.000000e+00, %p104
%p108 = fadd float undef, %p106
- br i1 undef, label %ENDIF69, label %ELSE
+ %uniform.cond = icmp slt i32 %arg18, 0
+ br i1 %uniform.cond, label %ret.bb, label %else
-ELSE: ; preds = %main_body
+else: ; preds = %main_body
%p124 = fmul float %p108, %p108
%p125 = fsub float %p124, undef
- %p126 = fcmp olt float %p125, 0.000000e+00
- br i1 %p126, label %ENDIF69, label %ELSE41
+ %divergent.cond = fcmp olt float %p125, 0.000000e+00
+ br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
-ELSE41: ; preds = %ELSE
+unreachable.bb: ; preds = %else
+ store volatile i32 8, i32 addrspace(3)* undef
unreachable
-ENDIF69: ; preds = %ELSE, %main_body
+ret.bb: ; preds = %else, %main_body
+ store volatile i32 11, i32 addrspace(1)* undef
ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
}
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll Fri Mar 24 14:52:05 2017
@@ -6,7 +6,7 @@
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_unreachable_noloop:
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
; GCN-NOT: s_endpgm
; GCN: .Lfunc_end0
define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
@@ -37,15 +37,52 @@ bb5:
; OPT-NOT: call i1 @llvm.amdgcn.loop
; GCN-LABEL: {{^}}annotate_ret_noloop:
-; GCN: s_cbranch_scc1
-; GCN: s_endpgm
-; GCN: .Lfunc_end1
+; GCN: load_dwordx4
+; GCN: v_cmp_nlt_f32
+; GCN: s_and_saveexec_b64
+; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: [[UNIFIED_RET]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN: .Lfunc_end
define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
br label %bb1
bb1: ; preds = %bb
+ %tmp2 = sext i32 %tmp to i64
+ %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+ %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+ %tmp5 = extractelement <4 x float> %tmp4, i32 1
+ store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
+ %cmp = fcmp ogt float %tmp5, 1.0
+ br i1 %cmp, label %bb5, label %bb3
+
+bb3: ; preds = %bb1
+ %tmp6 = extractelement <4 x float> %tmp4, i32 2
+ %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+ br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped
+
+bb4: ; preds = %bb3
+ ret void
+
+bb5: ; preds = %bb3, %bb1
+ ret void
+}
+
+; OPT-LABEL: @uniform_annotate_ret_noloop(
+; OPT-NOT: call i1 @llvm.amdgcn.loop
+
+; GCN-LABEL: {{^}}uniform_annotate_ret_noloop:
+; GCN: s_cbranch_scc1
+; GCN: s_endpgm
+; GCN: .Lfunc_end
+define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
+bb:
+ br label %bb1
+
+bb1: ; preds = %bb
%tmp2 = sext i32 %tmp to i64
%tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
%tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
Modified: llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll Fri Mar 24 14:52:05 2017
@@ -4,16 +4,17 @@
; GCN: v_cmp_eq_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[RET:BB[0-9]+]]
-; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
-; GCN: [[RET]]
-; GCN: s_or_b64 exec, exec
-; GCN: s_endpgm
-
-; GCN: [[UNREACHABLE]]:
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
+
+; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: s_endpgm
+
define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -29,18 +30,19 @@ ret:
}
; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
-; GCN: v_cmp_eq_u32
+; GCN: v_cmp_ne_u32
; GCN: s_and_saveexec_b64
; GCN: s_xor_b64
-; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: ; %ret
-; GCN-NEXT: s_endpgm
-
-; GCN-NEXT: [[UNREACHABLE]]:
-; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
; GCN: ds_write_b32
+; GCN: ; divergent unreachable
; GCN: s_waitcnt
+
+; GCN: [[RETURN]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
bb:
%tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,7 +57,29 @@ unreachable:
unreachable
}
-; Function Attrs: nounwind readnone
+; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB#{{[0-9]+}}: ; %ret
+; GCN-NEXT: s_endpgm
+
+; GCN: [[UNREACHABLE]]:
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
+bb:
+ %tmp63 = icmp eq i32 %arg0, 32
+ br i1 %tmp63, label %unreachable, label %ret
+
+unreachable:
+ store volatile i32 0, i32 addrspace(3)* undef, align 4
+ unreachable
+
+ret:
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.y() #1
attributes #0 = { nounwind }
Modified: llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll?rev=298729&r1=298728&r2=298729&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll Fri Mar 24 14:52:05 2017
@@ -64,29 +64,100 @@ end:
ret void
}
-; SI-LABEL: @simple_test_v_if
+; SI-LABEL: {{^}}simple_test_v_if:
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
-; SI: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
; SI: buffer_store_dword
-; SI: s_endpgm
+; SI-NEXT: s_waitcnt
-; SI: BB1_2:
+; SI-NEXT: {{^}}[[EXIT]]:
; SI: s_or_b64 exec, exec, [[BR_SREG]]
; SI: s_endpgm
define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%is.0 = icmp ne i32 %tid, 0
- br i1 %is.0, label %store, label %exit
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ br label %exit
+
+exit:
+ ret void
+}
+
+; FIXME: It would be better to endpgm in the then block.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
+; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[EXIT]]:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
+
+then:
+ %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+ store i32 999, i32 addrspace(1)* %gep
+ ret void
+
+exit:
+ ret void
+}
+
+; Final block has more than a ret to execute. This was miscompiled
+; before function exit blocks were unified since the endpgm would
+; terminate the then wavefront before reaching the store.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
+; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
+; SI: ds_write_b32
+; SI: s_waitcnt
+
+; SI-NEXT: {{^}}[[FLOW]]:
+; SI-NEXT: s_or_saveexec_b64
+; SI-NEXT: s_xor_b64 exec, exec
+; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
+; SI: s_or_b64 exec, exec
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+ %tid = call i32 @llvm.amdgcn.workitem.id.x()
+ %is.0 = icmp ne i32 %tid, 0
+ br i1 %is.0, label %then, label %exit
-store:
+then:
%gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
store i32 999, i32 addrspace(1)* %gep
ret void
exit:
+ store volatile i32 7, i32 addrspace(3)* undef
ret void
}