[llvm] [AMDGPU] Eliminate likely-spurious execz checks via intrinsic argument (PR #123749)

Fabian Ritter via llvm-commits llvm-commits at lists.llvm.org
Mon Jan 27 01:46:12 PST 2025


https://github.com/ritter-x2a updated https://github.com/llvm/llvm-project/pull/123749

From 5eefd5e5ff399dd7e26313a6d0ab3ad4f377344b Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 21 Jan 2025 07:55:16 -0500
Subject: [PATCH 1/5] [AMDGPU] Eliminate likely-spurious execz checks via
 intrinsic argument

Currently, we introduce branches that skip conditionally executed
instructions when the EXEC mask is zero, and we only eliminate these
branches if the scheduling model says that executing the skipped
instructions is cheaper than taking the branch.

This patch adds a heuristic to SIAnnotateControlFlow to determine if the
lanes of a wavefront are likely to have dynamically varying values for
the branch condition.
This information is passed through new arguments/operands of the
amdgcn.if/else intrinsics and the SI_IF/ELSE pseudo instructions to
SILowerControlFlow, which inserts the execz branch with branch
probabilities that reflect this information. SIPreEmitPeephole then
eliminates the execz branch if it is legal to do so.
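
For illustration, here is a minimal sketch of the annotation at the IR
level (the kernel and value names are made up and the end.cf bookkeeping
is omitted, but the intrinsic signature matches the updated definition in
IntrinsicsAMDGPU.td for a wave64 target). The branch condition is derived
from the workitem id, so the heuristic classifies it as likely varying
and the new flag is set to true:

  declare i32 @llvm.amdgcn.workitem.id.x()

  define amdgpu_kernel void @example(ptr addrspace(1) %out, i32 %n) {
  entry:
    %tid  = call i32 @llvm.amdgcn.workitem.id.x()
    %cond = icmp ult i32 %tid, %n
    br i1 %cond, label %then, label %endif

  then:                                   ; only lanes with %cond set
    store i32 1, ptr addrspace(1) %out
    br label %endif

  endif:
    ret void
  }

  ; After SIAnnotateControlFlow, the terminator of %entry becomes:
  ;   %if    = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 true)
  ;   %cond2 = extractvalue { i1, i64 } %if, 0
  ;   %mask  = extractvalue { i1, i64 } %if, 1
  ;   br i1 %cond2, label %then, label %endif
  ; With i1 true, SILowerControlFlow gives the execz branch a zero
  ; probability, and SIPreEmitPeephole can drop it.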

This is an alternative to PR #117567, using a simpler heuristic and
passing the LikelyVarying information through new arguments for the
amdgcn.if/else intrinsics and the SI_IF/ELSE pseudo instructions instead
of abusing branch weight metadata.

Most test changes are caused by the new arguments for the amdgcn.if/else
intrinsics and the SI_IF/ELSE pseudo instructions; the LikelyVarying
argument is set to false/0 in these existing tests.
New tests for the functionality are in
conditional-mem-no-cbranch-execz.ll and
annotate-likely-varying-branches.ll.

For SWDEV-483228.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      |   4 +-
 llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td     |   8 +-
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   3 +
 .../Target/AMDGPU/SIAnnotateControlFlow.cpp   |  97 ++-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  10 +-
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp |  32 +-
 .../Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp |   4 +-
 llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp  |   4 +
 .../DotMachineCFG/AMDGPU/irreducible.mir      |   8 +-
 .../AMDGPU/MIR/control-flow-intrinsics.mir    |   4 +-
 .../AMDGPU/MIR/uses-value-from-cycle.mir      |   2 +-
 .../AMDGPU/control-flow-intrinsics.ll         |  20 +-
 .../AMDGPU/deprecated/hidden-diverge.mir      |   4 +-
 .../AMDGPU/deprecated/irreducible-1.mir       |   4 +-
 ...ergence-divergent-i1-used-outside-loop.mir |  40 +-
 .../GlobalISel/divergence-structurizer.mir    |  48 +-
 .../divergence-temporal-divergent-i1.mir      |   4 +-
 .../global-atomic-fadd.f32-no-rtn.ll          |  16 +-
 .../GlobalISel/global-atomic-fadd.f32-rtn.ll  |  12 +-
 .../AMDGPU/GlobalISel/legalize-brcond.mir     |  44 +-
 .../regbankselect-mui-regbanklegalize.mir     |  12 +-
 .../regbankselect-mui-regbankselect.mir       |  12 +-
 .../AMDGPU/GlobalISel/regbankselect-mui.mir   |  18 +-
 .../AMDGPU/GlobalISel/vni8-across-blocks.ll   |  18 +-
 .../annotate-likely-varying-branches.ll       | 621 ++++++++++++++++++
 .../atomic_optimizations_pixelshader.ll       |  33 +-
 .../block-should-not-be-in-alive-blocks.mir   |   4 +-
 .../branch-folding-implicit-def-subreg.ll     |   2 +-
 .../AMDGPU/cgp-addressing-modes-flat.ll       |  36 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.ll    |  12 +-
 llvm/test/CodeGen/AMDGPU/collapse-endcf.mir   |  62 +-
 .../conditional-mem-no-cbranch-execz.ll       | 124 ++++
 .../AMDGPU/constant-fold-imm-immreg.mir       |   2 +-
 .../CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll |   2 +-
 llvm/test/CodeGen/AMDGPU/dpp_combine.mir      |   2 +-
 .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir |   2 +-
 .../AMDGPU/global-atomic-fadd.f32-no-rtn.ll   |  12 +-
 .../AMDGPU/global-atomic-fadd.f32-rtn.ll      |  12 +-
 .../i1_copy_phi_with_phi_incoming_value.mir   |   8 +-
 .../CodeGen/AMDGPU/insert-delay-alu-bug.ll    |  33 +-
 .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll    |  43 +-
 .../lower-control-flow-live-intervals.mir     |  12 +-
 ...wer-control-flow-live-variables-update.mir |  12 +-
 ...ntrol-flow-live-variables-update.xfail.mir |   4 +-
 .../lower-control-flow-other-terminators.mir  |  10 +-
 .../AMDGPU/lower-i1-copies-clear-kills.mir    |   8 +-
 .../machine-sink-ignorable-exec-use.mir       |  28 +-
 .../CodeGen/AMDGPU/machine-sink-lane-mask.mir |   4 +-
 ...-var-out-of-divergent-loop-swdev407790.mir |  16 +-
 .../memory-legalizer-atomic-insert-end.mir    |   4 +-
 ...er-multiple-mem-operands-nontemporal-1.mir |   4 +-
 llvm/test/CodeGen/AMDGPU/mmra.ll              |   2 +-
 .../AMDGPU/multi-divergent-exit-region.ll     |  34 +-
 llvm/test/CodeGen/AMDGPU/multilevel-break.ll  |   2 +-
 .../CodeGen/AMDGPU/nested-loop-conditions.ll  |  10 +-
 .../CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir  |  12 +-
 .../opt-vgpr-live-range-verifier-error.mir    |   6 +-
 .../CodeGen/AMDGPU/phi-elimination-end-cf.mir |   8 +-
 .../CodeGen/AMDGPU/si-annotate-dbg-info.ll    |   6 +-
 .../CodeGen/AMDGPU/si-fix-sgpr-copies.mir     |   2 +-
 .../CodeGen/AMDGPU/si-lower-control-flow.mir  |  14 +-
 ...lower-i1-copies-order-of-phi-incomings.mir |   4 +-
 .../CodeGen/AMDGPU/si-lower-i1-copies.mir     |   2 +-
 .../si-opt-vgpr-liverange-bug-deadlanes.mir   |   8 +-
 .../si-optimize-vgpr-live-range-dbg-instr.mir |   8 +-
 .../si-unify-exit-multiple-unreachables.ll    |   8 +-
 .../si-unify-exit-return-unreachable.ll       |  12 +-
 .../AMDGPU/stale-livevar-in-twoaddr-pass.mir  |   2 +-
 .../stop-tail-duplicate-cfg-intrinsic.mir     |   4 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |  10 +-
 llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll |  24 +-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |  21 +-
 ...ine-function-info-long-branch-reg-debug.ll |   4 +-
 .../CodeGen/MIR/AMDGPU/machine-metadata.mir   |   4 +-
 llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir   |   4 +-
 75 files changed, 1288 insertions(+), 463 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index cc3584833202bf4..f7f9bc361b142eb 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3409,11 +3409,11 @@ def int_amdgcn_ashr_pk_u8_i32 : ClangBuiltin<"__builtin_amdgcn_ashr_pk_u8_i32">,
 // having side effects, which is sufficient to prevent optimizations without
 // having to mark them as convergent.
 def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_i1_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_i1_ty, llvm_i1_ty], [ImmArg<ArgIndex<1>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_anyint_ty],
-  [llvm_anyint_ty], [IntrWillReturn, IntrNoCallback, IntrNoFree]
+  [llvm_anyint_ty, llvm_i1_ty], [ImmArg<ArgIndex<1>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 def int_amdgcn_if_break : Intrinsic<[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index bec294a945d2fea..89304bfe51efe61 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -42,12 +42,12 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
 def ImmOp : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
-def AMDGPUIfOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+def AMDGPUIfOp : SDTypeProfile<1, 3,
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>, SDTCisVT<3, OtherVT>]
 >;
 
-def AMDGPUElseOp : SDTypeProfile<1, 2,
-  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+def AMDGPUElseOp : SDTypeProfile<1, 3,
+  [SDTCisVT<0, i1>, SDTCisVT<1, i1>, SDTCisVT<2, i1>, SDTCisVT<3, OtherVT>]
 >;
 
 def AMDGPULoopOp : SDTypeProfile<0, 2,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index e9e47eaadd557f7..9fb756232e83bc9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7264,6 +7264,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
       Register Def = MI.getOperand(1).getReg();
       Register Use = MI.getOperand(3).getReg();
+      auto LikelyVarying = MI.getOperand(4).getImm();
 
       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
 
@@ -7275,11 +7276,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
         B.buildInstr(AMDGPU::SI_IF)
           .addDef(Def)
           .addUse(Use)
+          .addImm(LikelyVarying)
           .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
             .addDef(Def)
             .addUse(Use)
+            .addImm(LikelyVarying)
             .addMBB(UncondBrTarget);
       }
 
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 4ff6fc32b642dd2..854eb7706416aeb 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -36,6 +37,24 @@ namespace {
 using StackEntry = std::pair<BasicBlock *, Value *>;
 using StackVector = SmallVector<StackEntry, 16>;
 
+class LikelyVaryingHeuristic {
+public:
+  LikelyVaryingHeuristic(const Function &F, const GCNSubtarget &ST) {
+    IsSingleLaneExecution = ST.isSingleLaneExecution(F);
+  }
+
+  /// Check if \p V is likely to have dynamically varying values among the
+  /// workitems in each wavefront.
+  bool isLikelyVarying(const Value *V);
+
+private:
+  bool IsSingleLaneExecution = false;
+
+  bool isRelevantSourceOfDivergence(const Value *V) const;
+
+  ValueMap<const Value *, bool> LikelyVaryingCache;
+};
+
 class SIAnnotateControlFlow {
 private:
   Function *F;
@@ -62,6 +81,8 @@ class SIAnnotateControlFlow {
 
   LoopInfo *LI;
 
+  LikelyVaryingHeuristic LVHeuristic;
+
   void initialize(const GCNSubtarget &ST);
 
   bool isUniform(BranchInst *T);
@@ -99,7 +120,7 @@ class SIAnnotateControlFlow {
 public:
   SIAnnotateControlFlow(Function &F, const GCNSubtarget &ST, DominatorTree &DT,
                         LoopInfo &LI, UniformityInfo &UA)
-      : F(&F), UA(&UA), DT(&DT), LI(&LI) {
+      : F(&F), UA(&UA), DT(&DT), LI(&LI), LVHeuristic(F, ST) {
     initialize(ST);
   }
 
@@ -186,9 +207,14 @@ bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
   if (isUniform(Term))
     return false;
 
+  // Check if it's likely that at least one lane will always follow the
+  // then-branch, i.e., the then-branch is never skipped completely.
+  Value *IsLikelyVarying =
+      LVHeuristic.isLikelyVarying(Term->getCondition()) ? BoolTrue : BoolFalse;
+
   IRBuilder<> IRB(Term);
   Value *IfCall = IRB.CreateCall(getDecl(If, Intrinsic::amdgcn_if, IntMask),
-                                 {Term->getCondition()});
+                                 {Term->getCondition(), IsLikelyVarying});
   Value *Cond = IRB.CreateExtractValue(IfCall, {0});
   Value *Mask = IRB.CreateExtractValue(IfCall, {1});
   Term->setCondition(Cond);
@@ -202,9 +228,16 @@ bool SIAnnotateControlFlow::insertElse(BranchInst *Term) {
     return false;
   }
 
+  Value *IncomingMask = popSaved();
+  // Check if it's likely that at least one lane will always follow the
+  // else-branch, i.e., the else-branch is never skipped completely.
+  Value *IsLikelyVarying =
+      LVHeuristic.isLikelyVarying(IncomingMask) ? BoolTrue : BoolFalse;
+
   IRBuilder<> IRB(Term);
-  Value *ElseCall = IRB.CreateCall(
-      getDecl(Else, Intrinsic::amdgcn_else, {IntMask, IntMask}), {popSaved()});
+  Value *ElseCall =
+      IRB.CreateCall(getDecl(Else, Intrinsic::amdgcn_else, {IntMask, IntMask}),
+                     {IncomingMask, IsLikelyVarying});
   Value *Cond = IRB.CreateExtractValue(ElseCall, {0});
   Value *Mask = IRB.CreateExtractValue(ElseCall, {1});
   Term->setCondition(Cond);
@@ -385,6 +418,62 @@ bool SIAnnotateControlFlow::run() {
   return Changed;
 }
 
+bool LikelyVaryingHeuristic::isRelevantSourceOfDivergence(
+    const Value *V) const {
+  auto *II = dyn_cast<IntrinsicInst>(V);
+  if (!II)
+    return false;
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::amdgcn_workitem_id_z:
+  case Intrinsic::r600_read_tidig_z:
+  case Intrinsic::amdgcn_workitem_id_y:
+  case Intrinsic::r600_read_tidig_y:
+  case Intrinsic::amdgcn_workitem_id_x:
+  case Intrinsic::r600_read_tidig_x:
+  case Intrinsic::amdgcn_mbcnt_hi:
+  case Intrinsic::amdgcn_mbcnt_lo:
+    return true;
+  default:
+    return false;
+  }
+}
+
+bool LikelyVaryingHeuristic::isLikelyVarying(const Value *V) {
+  if (IsSingleLaneExecution)
+    return false;
+
+  if (isRelevantSourceOfDivergence(V))
+    return true;
+
+  auto *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  // ExtractValueInst and IntrinsicInst enable looking through the
+  // amdgcn_if/else intrinsics inserted by SIAnnotateControlFlow.
+  // This condition excludes PHINodes, which prevents infinite recursion.
+  if (!isa<BinaryOperator>(I) && !isa<UnaryOperator>(I) && !isa<CastInst>(I) &&
+      !isa<CmpInst>(I) && !isa<ExtractValueInst>(I) && !isa<IntrinsicInst>(I))
+    return false;
+
+  // Have we already checked V?
+  auto CacheEntry = LikelyVaryingCache.find(V);
+  if (CacheEntry != LikelyVaryingCache.end())
+    return CacheEntry->second;
+
+  // Does it use a likely varying Value?
+  bool Result = false;
+  for (const auto &Use : I->operands()) {
+    Result |= isLikelyVarying(Use);
+    if (Result)
+      break;
+  }
+
+  LikelyVaryingCache.insert({V, Result});
+  return Result;
+}
+
 PreservedAnalyses SIAnnotateControlFlowPass::run(Function &F,
                                                  FunctionAnalysisManager &FAM) {
   const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 15c77c2a723e411..5d0929a50f64101 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -417,8 +417,8 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
 let isTerminator = 1, isNotDuplicable = 1 in {
 
 def SI_IF: CFPseudoInstSI <
-  (outs SReg_1:$dst), (ins SReg_1:$vcc, brtarget:$target),
-  [(set i1:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
+  (outs SReg_1:$dst), (ins SReg_1:$vcc, i1imm:$likelyvarying, brtarget:$target),
+  [(set i1:$dst, (AMDGPUif i1:$vcc, (i1 timm:$likelyvarying), bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 12;
   let hasSideEffects = 1;
@@ -427,7 +427,7 @@ def SI_IF: CFPseudoInstSI <
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, brtarget:$target), [], 1, 1> {
+  (ins SReg_1:$src, i1imm:$likelyvarying, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
   let IsNeverUniform = 1;
@@ -1049,8 +1049,8 @@ def : GCNPat<
 >;
 
 def : GCNPat<
-  (AMDGPUelse i1:$src, bb:$target),
-  (SI_ELSE $src, $target)
+  (AMDGPUelse i1:$src, i1:$likelyvarying, bb:$target),
+  (SI_ELSE $src, $likelyvarying, $target)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index f8878f32f829d13..d59f3504a2e342d 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -55,8 +55,11 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/BranchProbability.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -221,9 +224,11 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand& Cond = MI.getOperand(1);
   assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
 
-  MachineOperand &ImpDefSCC = MI.getOperand(4);
+  MachineOperand &ImpDefSCC = MI.getOperand(5);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
+  bool LikelyVarying = MI.getOperand(2).getImm();
+
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
   // just cleared bits.
@@ -281,7 +286,17 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
   // during SIPreEmitPeephole.
   MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
-                            .add(MI.getOperand(2));
+                            .add(MI.getOperand(3));
+
+  if (LikelyVarying) {
+    MachineBasicBlock *ExeczDest = MI.getOperand(3).getMBB();
+    auto **E = MBB.succ_end();
+    for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
+      if (*SI == ExeczDest)
+        MBB.setSuccProbability(SI, BranchProbability::getZero());
+    }
+    MBB.normalizeSuccProbs();
+  }
 
   if (!LIS) {
     MI.eraseFromParent();
@@ -329,7 +344,9 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   if (LV)
     LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);
 
-  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
+  bool LikelyVarying = MI.getOperand(2).getImm();
+
+  MachineBasicBlock *DestBB = MI.getOperand(3).getMBB();
 
   MachineBasicBlock::iterator ElsePt(MI);
 
@@ -352,6 +369,15 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
       BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
           .addMBB(DestBB);
 
+  if (LikelyVarying) {
+    auto **E = MBB.succ_end();
+    for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
+      if (*SI == DestBB)
+        MBB.setSuccProbability(SI, BranchProbability::getZero());
+    }
+    MBB.normalizeSuccProbs();
+  }
+
   if (!LIS) {
     MI.eraseFromParent();
     return;
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index ff0b9b4a7574bfa..f0791efe7d191cd 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -184,7 +184,7 @@ MachineBasicBlock *
 SIOptimizeVGPRLiveRange::getElseTarget(MachineBasicBlock *MBB) const {
   for (auto &BR : MBB->terminators()) {
     if (BR.getOpcode() == AMDGPU::SI_ELSE)
-      return BR.getOperand(2).getMBB();
+      return BR.getOperand(3).getMBB();
   }
   return nullptr;
 }
@@ -682,7 +682,7 @@ bool SIOptimizeVGPRLiveRange::run(MachineFunction &MF) {
     for (auto &MI : MBB.terminators()) {
       // Detect the if-else blocks
       if (MI.getOpcode() == AMDGPU::SI_IF) {
-        MachineBasicBlock *IfTarget = MI.getOperand(2).getMBB();
+        MachineBasicBlock *IfTarget = MI.getOperand(3).getMBB();
         auto *Endif = getElseTarget(IfTarget);
         if (!Endif)
           continue;
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 2bb70c138a50c4b..55777d7da366b04 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -14,6 +14,7 @@
 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/BranchProbability.h"
@@ -22,6 +23,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "si-pre-emit-peephole"
 
+STATISTIC(NumCBranchExeczElim, "Number of s_cbranch_execz eliminated.");
+
 namespace {
 
 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -404,6 +407,7 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
     return false;
 
   LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+  ++NumCBranchExeczElim;
   MI.eraseFromParent();
   SrcMBB.removeSuccessor(TrueMBB);
 
diff --git a/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir b/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
index 56ea4b528ba8f14..d618f21fff694b9 100644
--- a/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
+++ b/llvm/test/Analysis/DotMachineCFG/AMDGPU/irreducible.mir
@@ -14,10 +14,10 @@
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.3:\l|\l successors: %bb.4(0x80000000)\l\l  %4:vgpr_32 = PHI %5:vgpr_32, %bb.1, %7:vgpr_32, %bb.2\l}"];
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
-# MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.4:\l|\l successors: %bb.2(0x40000000), %bb.5(0x40000000)\l\l  %8:vgpr_32 = V_AND_B32_e32 3, %1:vgpr_32, implicit $exec\l  %9:sreg_64 = V_CMP_EQ_U32_e64 %8:vgpr_32, 2, implicit $exec\l  %10:sreg_64 = SI_IF killed %9:sreg_64, %bb.2, implicit-def dead $exec,\l... implicit-def dead $scc, implicit $exec\l}"];
+# MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.4:\l|\l successors: %bb.2(0x40000000), %bb.5(0x40000000)\l\l  %8:vgpr_32 = V_AND_B32_e32 3, %1:vgpr_32, implicit $exec\l  %9:sreg_64 = V_CMP_EQ_U32_e64 %8:vgpr_32, 2, implicit $exec\l  %10:sreg_64 = SI_IF killed %9:sreg_64, 0, %bb.2, implicit-def dead $exec,\l... implicit-def dead $scc, implicit $exec\l}"];
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
-# MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.5:\l|\l successors: %bb.1(0x40000000), %bb.6(0x40000000)\l\l  %11:sreg_64 = V_CMP_EQ_U32_e64 %8:vgpr_32, 1, implicit $exec\l  %12:sreg_64 = SI_IF killed %11:sreg_64, %bb.1, implicit-def dead $exec,\l... implicit-def dead $scc, implicit $exec\l}"];
+# MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.5:\l|\l successors: %bb.1(0x40000000), %bb.6(0x40000000)\l\l  %11:sreg_64 = V_CMP_EQ_U32_e64 %8:vgpr_32, 1, implicit $exec\l  %12:sreg_64 = SI_IF killed %11:sreg_64, 0, %bb.1, implicit-def dead $exec,\l... implicit-def dead $scc, implicit $exec\l}"];
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} -> Node{{[0-9A-Za-z]*}};
 # MCFG-NEXT: Node{{[0-9A-Za-z]*}} [shape=record,label="{bb.6:\l|\l\l S_ENDPGM 0\l}"];
@@ -74,12 +74,12 @@ body:             |
 
     %50:vgpr_32 = V_AND_B32_e32 3, %2, implicit $exec
     %51:sreg_64 = V_CMP_EQ_U32_e64 %50, 2, implicit $exec
-    %52:sreg_64 = SI_IF killed %51:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %52:sreg_64 = SI_IF killed %51:sreg_64, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.5:
     successors: %bb.1, %bb.6
     %61:sreg_64 = V_CMP_EQ_U32_e64 %50, 1, implicit $exec
-    %62:sreg_64 = SI_IF killed %61:sreg_64, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %62:sreg_64 = SI_IF killed %61:sreg_64, 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.6:
     S_ENDPGM 0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/control-flow-intrinsics.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/control-flow-intrinsics.mir
index dec55e5662c8c65..58c710389b2b2b3 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/control-flow-intrinsics.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/control-flow-intrinsics.mir
@@ -10,7 +10,7 @@ body:             |
     ; CHECK-NOT: DIVERGENT: %1
     %1:sreg_64(s64) = G_IMPLICIT_DEF
     ; CHECK: DIVERGENT: {{.*}} SI_IF
-    %2:sreg_64 = SI_IF %1, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %2:sreg_64 = SI_IF %1, 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
 
   bb.1:
     SI_RETURN
@@ -30,7 +30,7 @@ body:             |
     ; CHECK-NOT: DIVERGENT: %1
     %1:sreg_64(s64) = G_IMPLICIT_DEF
     ; CHECK: DIVERGENT: {{.*}} SI_ELSE
-    %2:sreg_64 = SI_ELSE %1, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %2:sreg_64 = SI_ELSE %1, 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
 
   bb.1:
     SI_RETURN
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
index c1acbb3a1575d58..0bd2c70e2a11c63 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/uses-value-from-cycle.mir
@@ -35,7 +35,7 @@ body:             |
     %13:_(s32) = G_LOAD %11(p0) :: (load (s32))
     %37:_(s32) = G_CONSTANT i32 0
     %14:sreg_32_xm0_xexec(s1) = G_ICMP intpred(slt), %13(s32), %37
-    %16:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %16:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.7
   
   bb.5:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/control-flow-intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/control-flow-intrinsics.ll
index b92daa64040e43d..9fadd900a7c64fe 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/control-flow-intrinsics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/control-flow-intrinsics.ll
@@ -16,14 +16,14 @@ entry:
 
 ; CHECK: for function 'test_if':
 ; CHECK: DIVERGENT: %cond = icmp eq i32 %arg0, 0
-; CHECK-NEXT: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+; CHECK-NEXT: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 false)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
 ; CHECK-NOT: DIVERGENT
 ; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
 define void @test_if(i32 %arg0) {
 entry:
   %cond = icmp eq i32 %arg0, 0
-  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 false)
   %if.bool = extractvalue { i1, i64 } %if, 0
   %if.mask = extractvalue { i1, i64 } %if, 1
   %if.bool.ext = zext i1 %if.bool to i32
@@ -35,14 +35,14 @@ entry:
 ; The result should still be treated as divergent, even with a uniform source.
 ; CHECK: for function 'test_if_uniform':
 ; CHECK-NOT: DIVERGENT
-; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 false)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
 ; CHECK-NOT: DIVERGENT
 ; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
 define amdgpu_ps void @test_if_uniform(i32 inreg %arg0) {
 entry:
   %cond = icmp eq i32 %arg0, 0
-  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond)
+  %if = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 false)
   %if.bool = extractvalue { i1, i64 } %if, 0
   %if.mask = extractvalue { i1, i64 } %if, 1
   %if.bool.ext = zext i1 %if.bool to i32
@@ -62,12 +62,12 @@ entry:
 }
 
 ; CHECK: for function 'test_else':
-; CHECK: DIVERGENT: %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+; CHECK: DIVERGENT: %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask, i1 false)
 ; CHECK: DIVERGENT:       %else.bool = extractvalue { i1, i64 } %else, 0
 ; CHECK: {{^[ \t]+}}%else.mask = extractvalue { i1, i64 } %else, 1
 define amdgpu_ps void @test_else(i64 inreg %mask) {
 entry:
-  %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+  %else = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask, i1 false)
   %else.bool = extractvalue { i1, i64 } %else, 0
   %else.mask = extractvalue { i1, i64 } %else, 1
   %else.bool.ext = zext i1 %else.bool to i32
@@ -78,13 +78,13 @@ entry:
 
 ; This case is probably always broken
 ; CHECK: for function 'test_else_divergent_mask':
-; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+; CHECK: DIVERGENT: %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask, i1 false)
 ; CHECK-NEXT: DIVERGENT: %if.bool = extractvalue { i1, i64 } %if, 0
 ; CHECK-NOT: DIVERGENT
 ; CHECK: DIVERGENT: %if.bool.ext = zext i1 %if.bool to i32
 define void @test_else_divergent_mask(i64 %mask) {
 entry:
-  %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask)
+  %if = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %mask, i1 false)
   %if.bool = extractvalue { i1, i64 } %if, 0
   %if.mask = extractvalue { i1, i64 } %if, 1
   %if.bool.ext = zext i1 %if.bool to i32
@@ -93,8 +93,8 @@ entry:
   ret void
 }
 
-declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #0
-declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #0
+declare { i1, i64 } @llvm.amdgcn.if.i64(i1, i1) #0
+declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64, i1) #0
 declare i64 @llvm.amdgcn.if.break.i64.i64(i1, i64) #1
 declare i1 @llvm.amdgcn.loop.i64(i64) #1
 
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
index d1a61100a14cb8e..bd906141f65318d 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/hidden-diverge.mir
@@ -35,7 +35,7 @@ body:             |
     %23:sreg_64 = V_CMP_LT_I32_e64 %10(s32), killed %22, implicit $exec
     %1:vreg_1 = COPY %21
     %14:sreg_32 = IMPLICIT_DEF
-    %2:sreg_64 = SI_IF killed %23, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF killed %23, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -55,7 +55,7 @@ body:             |
     %6:vreg_1 = PHI %1, %bb.0, %4, %bb.1
     SI_END_CF %2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     %27:sreg_64 = COPY %6
-    %7:sreg_64 = SI_IF %27, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %7:sreg_64 = SI_IF %27, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
index f784f05e12832bd..5d9a85c3d36ab76 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/deprecated/irreducible-1.mir
@@ -41,12 +41,12 @@ body:             |
 
     %50:vgpr_32 = V_AND_B32_e32 3, %2, implicit $exec
     %51:sreg_64 = V_CMP_EQ_U32_e64 %50, 2, implicit $exec
-    %52:sreg_64 = SI_IF killed %51:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %52:sreg_64 = SI_IF killed %51:sreg_64, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.5:
     successors: %bb.1, %bb.6
     %61:sreg_64 = V_CMP_EQ_U32_e64 %50, 1, implicit $exec
-    %62:sreg_64 = SI_IF killed %61:sreg_64, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %62:sreg_64 = SI_IF killed %61:sreg_64, 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.6:
     S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 5bbe3e488689983..7f91ccb3660b498 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -145,7 +145,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -205,7 +205,7 @@ body: |
     %8:_(s32) = G_PHI %6(s32), %bb.0, %9(s32), %bb.3
     %10:_(p1) = G_PHI %2(p1), %bb.0, %11(p1), %bb.3
     %12:sreg_32_xm0_xexec(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.3
-    %14:sreg_32_xm0_xexec(s32) = SI_IF %12(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %14:sreg_32_xm0_xexec(s32) = SI_IF %12(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -355,7 +355,7 @@ body: |
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY5]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -373,7 +373,7 @@ body: |
   ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), 0, %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -405,7 +405,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -480,7 +480,7 @@ body: |
     %8:_(s32) = G_CONSTANT i32 0
     %9:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %8
     %10:_(s1) = G_CONSTANT i1 true
-    %11:sreg_32_xm0_xexec(s32) = SI_IF %9(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %11:sreg_32_xm0_xexec(s32) = SI_IF %9(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 
   bb.1:
@@ -494,7 +494,7 @@ body: |
 
     %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
-    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), 0, %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
 
   bb.3:
@@ -510,7 +510,7 @@ body: |
     %25:_(s32) = G_LOAD %24(p1) :: (load (s32), addrspace 1)
     %26:_(s32) = G_CONSTANT i32 0
     %27:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %25(s32), %26
-    %28:sreg_32_xm0_xexec(s32) = SI_IF %27(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    %28:sreg_32_xm0_xexec(s32) = SI_IF %27(s1), 0, %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.4:
@@ -589,7 +589,7 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI2]]
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -609,7 +609,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -644,7 +644,7 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), 0, %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
@@ -680,7 +680,7 @@ body: |
     successors: %bb.3(0x40000000), %bb.4(0x40000000)
 
     %14:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %12
-    %15:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
 
   bb.3:
@@ -698,7 +698,7 @@ body: |
     %20:_(s1) = G_CONSTANT i1 true
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
     %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12
-    %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+    %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), 0, %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.5
 
   bb.5:
@@ -725,7 +725,7 @@ body: |
     %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6
     %29:_(s32) = G_PHI %12(s32), %bb.6
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
-    %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+    %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), 0, %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.8
 
   bb.8:
@@ -777,7 +777,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY8]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY8]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -849,7 +849,7 @@ body: |
     %9:_(s32) = G_PHI %10(s32), %bb.3, %7(s32), %bb.0
     %11:_(s32) = G_PHI %7(s32), %bb.0, %12(s32), %bb.3
     %13:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %14(s1), %bb.3
-    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -941,7 +941,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -1002,7 +1002,7 @@ body: |
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
@@ -1033,7 +1033,7 @@ body: |
     %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
     %21:_(s32) = G_CONSTANT i32 0
     %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
-    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
 
   bb.2:
@@ -1080,6 +1080,6 @@ body: |
     %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
     %39:_(s32) = G_PHI %12(s32), %bb.5
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
-    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 39ebf66411cc657..7be7ddfc63cbc20 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -22,7 +22,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -57,7 +57,7 @@ body: |
     %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5
     %7:_(s32) = G_CONSTANT i32 0
     %8:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %4(s32), %7
-    %9:sreg_32_xm0_xexec(s32) = SI_IF %8(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %9:sreg_32_xm0_xexec(s32) = SI_IF %8(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 
   bb.1:
@@ -96,7 +96,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -106,7 +106,7 @@ body: |
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
-  ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -152,14 +152,14 @@ body: |
     %5:_(s1) = G_IMPLICIT_DEF
     %6:_(s32) = G_CONSTANT i32 0
     %7:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %4(s32), %6
-    %8:sreg_32_xm0_xexec(s32) = SI_IF %7(s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    %8:sreg_32_xm0_xexec(s32) = SI_IF %7(s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
 
   bb.1:
     successors: %bb.2(0x40000000), %bb.4(0x40000000)
 
     %9:_(s1) = G_PHI %10(s1), %bb.3, %5(s1), %bb.0
-    %11:sreg_32_xm0_xexec(s32) = SI_ELSE %8(s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    %11:sreg_32_xm0_xexec(s32) = SI_ELSE %8(s32), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -226,7 +226,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -288,7 +288,7 @@ body: |
     %17:_(s32) = G_LOAD %16(p1) :: (load (s32), addrspace 1)
     %18:_(s32) = G_CONSTANT i32 0
     %19:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %17(s32), %18
-    %20:sreg_32_xm0_xexec(s32) = SI_IF %19(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %20:sreg_32_xm0_xexec(s32) = SI_IF %19(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -364,7 +364,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -379,7 +379,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -457,7 +457,7 @@ body: |
     %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
     %21:_(s32) = G_CONSTANT i32 0
     %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
-    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -470,7 +470,7 @@ body: |
     %28:_(s32) = G_LOAD %27(p1) :: (load (s32), addrspace 1)
     %29:_(s32) = G_CONSTANT i32 0
     %30:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %28(s32), %29
-    %31:sreg_32_xm0_xexec(s32) = SI_IF %30(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %31:sreg_32_xm0_xexec(s32) = SI_IF %30(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.3:
@@ -557,7 +557,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -572,7 +572,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -598,7 +598,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD2]](s32), [[C9]]
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), 0, %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -681,7 +681,7 @@ body: |
     %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1)
     %24:_(s32) = G_CONSTANT i32 0
     %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24
-    %26:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %26:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -694,7 +694,7 @@ body: |
     %31:_(s32) = G_LOAD %30(p1) :: (load (s32), addrspace 1)
     %32:_(s32) = G_CONSTANT i32 0
     %33:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %31(s32), %32
-    %34:sreg_32_xm0_xexec(s32) = SI_IF %33(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %34:sreg_32_xm0_xexec(s32) = SI_IF %33(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.3:
@@ -717,7 +717,7 @@ body: |
     %42:_(s32) = G_LOAD %41(p1) :: (load (s32), addrspace 1)
     %43:_(s32) = G_CONSTANT i32 0
     %44:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %42(s32), %43
-    %45:sreg_32_xm0_xexec(s32) = SI_IF %44(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    %45:sreg_32_xm0_xexec(s32) = SI_IF %44(s1), 0, %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.6
 
   bb.5:
@@ -810,7 +810,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -871,7 +871,7 @@ body: |
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
@@ -902,7 +902,7 @@ body: |
     %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
     %21:_(s32) = G_CONSTANT i32 0
     %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
-    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.3
 
   bb.2:
@@ -949,7 +949,7 @@ body: |
     %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
     %39:_(s32) = G_PHI %12(s32), %bb.5
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
-    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
 
@@ -1070,7 +1070,7 @@ body: |
   ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY15]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY15]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   bb.0:
     successors: %bb.7(0x80000000)
@@ -1148,6 +1148,6 @@ body: |
     %17:_(s32) = G_PHI %6(s32), %bb.4, %16(s32), %bb.2, %6(s32), %bb.0
     %31:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %11(s1), %bb.2, %21(s1), %bb.4
     %14:_(s1) = G_CONSTANT i1 true
-    %15:sreg_32_xm0_xexec(s32) = SI_IF %31(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %31(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index fb436623bed2d59..2e20857d799bf80 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -276,7 +276,7 @@ body: |
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INT]](s32), %bb.5
   ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
@@ -353,6 +353,6 @@ body: |
     %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5
     %38:_(s32) = G_PHI %13(s32), %bb.5
     G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
-    %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
index 6459110dd8bbb15..d458af7878e9c34 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -64,7 +64,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX908-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX908-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX908-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX908-NEXT:   S_BRANCH %bb.2
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2 (%ir-block.5):
@@ -108,7 +108,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec
   ; GFX908-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX908-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec
-  ; GFX908-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX908-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX908-NEXT:   S_BRANCH %bb.3
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.3 (%ir-block.31):
@@ -136,7 +136,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX90A-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.2
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2 (%ir-block.5):
@@ -180,7 +180,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec
   ; GFX90A-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX90A-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec
-  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3 (%ir-block.31):
@@ -208,7 +208,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64_xexec_xnull = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX940-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.2 (%ir-block.5):
@@ -252,7 +252,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX940-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY17]], implicit $exec
   ; GFX940-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY18]], implicit $exec
-  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.3
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.3 (%ir-block.31):
@@ -281,7 +281,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE
-  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.2 (%ir-block.5):
@@ -314,7 +314,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[V_ADD_F32_e64_4]], implicit $exec
   ; GFX11-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY12]], implicit $exec
-  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.3 (%ir-block.24):
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
index e935245e30f1247..2e3963ba2ce2e86 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -56,7 +56,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX90A-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX90A-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.2
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2 (%ir-block.5):
@@ -102,7 +102,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
   ; GFX90A-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX90A-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
-  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.3 (%ir-block.32):
@@ -147,7 +147,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX940-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64_xexec = SI_PS_LIVE
-  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.2 (%ir-block.5):
@@ -193,7 +193,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY18]], implicit $exec
   ; GFX940-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_HI_U32_B32_e64_]], [[COPY19]], implicit $exec
-  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.3
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.3 (%ir-block.32):
@@ -238,7 +238,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; GFX11-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32_xm0_xexec = SI_PS_LIVE
-  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[SI_PS_LIVE]], 0, %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.2 (%ir-block.5):
@@ -283,7 +283,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY14]], implicit $exec
   ; GFX11-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 [[V_MBCNT_LO_U32_B32_e64_]], [[COPY15]], implicit $exec
-  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.3
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.3 (%ir-block.29):
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
index 57bbe020dca8509..3932ddfc707d25c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-brcond.mir
@@ -17,6 +17,7 @@ body:             |
   ; WAVE64-NEXT:   G_BRCOND [[ICMP]](s1), %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
+  ;
   ; WAVE32-LABEL: name: legal_brcond_vcc
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -55,6 +56,7 @@ body: |
   ; WAVE64-NEXT:   G_BRCOND [[ICMP]](s1), %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
+  ;
   ; WAVE32-LABEL: name: legal_brcond_sgpr_s1
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -94,6 +96,7 @@ body: |
   ; WAVE64-NEXT:   G_BRCOND [[ICMP]](s32), %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
+  ;
   ; WAVE32-LABEL: name: legal_brcond_sgpr_s32
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -128,10 +131,11 @@ body:             |
   ; WAVE64-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE64-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64-NEXT:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64-NEXT:   G_BR %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
+  ;
   ; WAVE32-LABEL: name: brcond_si_if
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -140,7 +144,7 @@ body:             |
   ; WAVE32-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE32-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE32-NEXT:   G_BR %bb.1
   ; WAVE32-NEXT: {{  $}}
   ; WAVE32-NEXT: bb.1:
@@ -150,7 +154,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2, 0
     G_BRCOND %3, %bb.1
 
   bb.1:
@@ -167,10 +171,11 @@ body:             |
   ; WAVE64-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE64-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64-NEXT:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE64-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64_xexec(s64) = SI_ELSE [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64-NEXT:   G_BR %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
+  ;
   ; WAVE32-LABEL: name: brcond_si_else
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -179,7 +184,7 @@ body:             |
   ; WAVE32-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE32-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE32-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_ELSE [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE32-NEXT:   G_BR %bb.1
   ; WAVE32-NEXT: {{  $}}
   ; WAVE32-NEXT: bb.1:
@@ -189,7 +194,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.else), %2, 0
     G_BRCOND %3, %bb.1
 
   bb.1:
@@ -217,6 +222,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 0
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_brcond
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -276,6 +282,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 0
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_brcond_back
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -334,6 +341,7 @@ body:             |
   ; WAVE64-NEXT:   G_BR %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_brcond_back_fallthrough
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -380,11 +388,12 @@ body:             |
   ; WAVE64-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64-NEXT:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE64-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64-NEXT:   G_BR %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
   ; WAVE64-NEXT:   S_ENDPGM 0, implicit [[COPY2]](s32)
+  ;
   ; WAVE32-LABEL: name: brcond_si_if_need_insert_terminator_point
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -394,7 +403,7 @@ body:             |
   ; WAVE32-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
   ; WAVE32-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
-  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE32-NEXT:   G_BR %bb.1
   ; WAVE32-NEXT: {{  $}}
   ; WAVE32-NEXT: bb.1:
@@ -405,7 +414,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2, 0
     %5:_(s32) = COPY $vgpr2
     G_BRCOND %3, %bb.1
 
@@ -437,6 +446,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 0
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_need_terminator_insert_point
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -487,7 +497,7 @@ body:             |
   ; WAVE64-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE64-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64-NEXT:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64-NEXT:   G_BR %bb.1
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
@@ -497,6 +507,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 1
+  ;
   ; WAVE32-LABEL: name: brcond_si_if_negated
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -505,7 +516,7 @@ body:             |
   ; WAVE32-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE32-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE32-NEXT:   G_BR %bb.1
   ; WAVE32-NEXT: {{  $}}
   ; WAVE32-NEXT: bb.1:
@@ -521,7 +532,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2, 0
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_XOR %3, %5
     G_BRCOND %6, %bb.2
@@ -544,7 +555,7 @@ body:             |
   ; WAVE64-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE64-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE64-NEXT:   [[ICMP:%[0-9]+]]:sreg_64_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE64-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE64-NEXT:   G_BR %bb.3
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.1:
@@ -559,6 +570,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.3:
   ; WAVE64-NEXT:   S_NOP 2
+  ;
   ; WAVE32-LABEL: name: brcond_si_if_br_negated
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -567,7 +579,7 @@ body:             |
   ; WAVE32-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
   ; WAVE32-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
   ; WAVE32-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY]](s32), [[COPY1]]
-  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; WAVE32-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s64) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; WAVE32-NEXT:   G_BR %bb.3
   ; WAVE32-NEXT: {{  $}}
   ; WAVE32-NEXT: bb.1:
@@ -588,7 +600,7 @@ body:             |
     %0:_(s32) = COPY $vgpr0
     %1:_(s32) = COPY $vgpr1
     %2:_(s1) = G_ICMP intpred(ne), %0, %1
-    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2
+    %3:_(s1), %4:_(s64) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.if), %2, 0
     %5:_(s1) = G_CONSTANT i1 true
     %6:_(s1) = G_XOR %3, %5
     G_BRCOND %6, %bb.2
@@ -626,6 +638,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 0
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_brcond_negated
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
@@ -684,6 +697,7 @@ body:             |
   ; WAVE64-NEXT: {{  $}}
   ; WAVE64-NEXT: bb.2:
   ; WAVE64-NEXT:   S_NOP 0
+  ;
   ; WAVE32-LABEL: name: brcond_si_loop_brcond_br_negated
   ; WAVE32: bb.0:
   ; WAVE32-NEXT:   successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir
index 3674fb9156f7a2f..6f097c46a90eef8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbanklegalize.mir
@@ -595,7 +595,7 @@ body: |
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
   ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
@@ -620,7 +620,7 @@ body: |
     %4:sgpr(s32) = G_CONSTANT i32 0
     %9:vcc(s1) = G_ICMP intpred(eq), %0(s32), %4
     %5:sreg_32_xm0_xexec(s1) = COPY %9(s1)
-    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 
   bb.1:
@@ -760,7 +760,7 @@ body: |
   ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[AMDGPU_COPY_VCC_SCC]](s1), implicit-def $scc
   ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -778,7 +778,7 @@ body: |
   ; CHECK-NEXT:   [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
   ; CHECK-NEXT:   [[AMDGPU_COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C7]](s32)
   ; CHECK-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[AMDGPU_COPY_VCC_SCC1]](s1)
-  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.4
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -872,7 +872,7 @@ body: |
     %30:sreg_32(s1) = S_AND_B32 $exec_lo, %28(s1), implicit-def $scc
     %31:sreg_32(s1) = S_OR_B32 %29(s1), %30(s1), implicit-def $scc
     %32:sreg_32(s1) = COPY %31(s1)
-    %33:sreg_32_xm0_xexec(s32) = SI_IF %26(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %33:sreg_32_xm0_xexec(s32) = SI_IF %26(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -888,7 +888,7 @@ body: |
     %41:sgpr(s1) = G_CONSTANT i1 true
     %42:sreg_32(s1) = COPY %41(s1)
     %43:sreg_32(s1) = COPY %42(s1)
-    %44:sreg_32_xm0_xexec(s32) = SI_IF %40(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %44:sreg_32_xm0_xexec(s32) = SI_IF %40(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir
index 8f3495ea87eec49..73cb9db02fce3ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui-regbankselect.mir
@@ -538,7 +538,7 @@ body: |
   ; CHECK-NEXT:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; CHECK-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
   ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY3]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY3]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
@@ -562,7 +562,7 @@ body: |
     %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
     %4:_(s32) = G_CONSTANT i32 0
     %5:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %4
-    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 
   bb.1:
@@ -694,7 +694,7 @@ body: |
   ; CHECK-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
   ; CHECK-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -710,7 +710,7 @@ body: |
   ; CHECK-NEXT:   [[C6:%[0-9]+]]:sgpr(s1) = G_CONSTANT i1 true
   ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
   ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1)
-  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY10]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY10]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   G_BR %bb.4
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -800,7 +800,7 @@ body: |
     %29:sreg_32(s1) = S_AND_B32 $exec_lo, %27(s1), implicit-def $scc
     %30:sreg_32(s1) = S_OR_B32 %28(s1), %29(s1), implicit-def $scc
     %31:sreg_32(s1) = COPY %30(s1)
-    %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -815,7 +815,7 @@ body: |
     %39:_(s1) = G_CONSTANT i1 true
     %40:sreg_32(s1) = COPY %39(s1)
     %41:sreg_32(s1) = COPY %40(s1)
-    %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir
index 1b22ee4b3fffcd9..5dacd52571f52eb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-mui.mir
@@ -872,7 +872,7 @@ body: |
   ; OLD_RBS-NEXT:   [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 0
   ; OLD_RBS-NEXT:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; OLD_RBS-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
-  ; OLD_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; OLD_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; OLD_RBS-NEXT:   G_BR %bb.1
   ; OLD_RBS-NEXT: {{  $}}
   ; OLD_RBS-NEXT: bb.1:
@@ -900,7 +900,7 @@ body: |
   ; NEW_RBS-NEXT:   [[COPY3:%[0-9]+]]:vgpr(s32) = COPY [[C]](s32)
   ; NEW_RBS-NEXT:   [[ICMP:%[0-9]+]]:vcc(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[COPY3]]
   ; NEW_RBS-NEXT:   [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
-  ; NEW_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; NEW_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY4]](s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; NEW_RBS-NEXT:   G_BR %bb.1
   ; NEW_RBS-NEXT: {{  $}}
   ; NEW_RBS-NEXT: bb.1:
@@ -924,7 +924,7 @@ body: |
     %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
     %4:_(s32) = G_CONSTANT i32 0
     %5:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %4
-    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %6:sreg_32_xm0_xexec(s32) = SI_IF %5(s1), 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.1
 
   bb.1:
@@ -1099,7 +1099,7 @@ body: |
   ; OLD_RBS-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; OLD_RBS-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; OLD_RBS-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; OLD_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; OLD_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; OLD_RBS-NEXT:   G_BR %bb.2
   ; OLD_RBS-NEXT: {{  $}}
   ; OLD_RBS-NEXT: bb.2:
@@ -1117,7 +1117,7 @@ body: |
   ; OLD_RBS-NEXT:   [[TRUNC1:%[0-9]+]]:sgpr(s1) = G_TRUNC [[C7]](s32)
   ; OLD_RBS-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[TRUNC1]](s1)
   ; OLD_RBS-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
-  ; OLD_RBS-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; OLD_RBS-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; OLD_RBS-NEXT:   G_BR %bb.4
   ; OLD_RBS-NEXT: {{  $}}
   ; OLD_RBS-NEXT: bb.3:
@@ -1214,7 +1214,7 @@ body: |
   ; NEW_RBS-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[AMDGPU_COPY_VCC_SCC]](s1), implicit-def $scc
   ; NEW_RBS-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; NEW_RBS-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; NEW_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; NEW_RBS-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY9]](s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; NEW_RBS-NEXT:   G_BR %bb.2
   ; NEW_RBS-NEXT: {{  $}}
   ; NEW_RBS-NEXT: bb.2:
@@ -1232,7 +1232,7 @@ body: |
   ; NEW_RBS-NEXT:   [[C7:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1
   ; NEW_RBS-NEXT:   [[AMDGPU_COPY_VCC_SCC1:%[0-9]+]]:sreg_32(s1) = G_AMDGPU_COPY_VCC_SCC [[C7]](s32)
   ; NEW_RBS-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[AMDGPU_COPY_VCC_SCC1]](s1)
-  ; NEW_RBS-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; NEW_RBS-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY13]](s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; NEW_RBS-NEXT:   G_BR %bb.4
   ; NEW_RBS-NEXT: {{  $}}
   ; NEW_RBS-NEXT: bb.3:
@@ -1325,7 +1325,7 @@ body: |
     %29:sreg_32(s1) = S_AND_B32 $exec_lo, %27(s1), implicit-def $scc
     %30:sreg_32(s1) = S_OR_B32 %28(s1), %29(s1), implicit-def $scc
     %31:sreg_32(s1) = COPY %30(s1)
-    %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %32:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.2
 
   bb.2:
@@ -1340,7 +1340,7 @@ body: |
     %39:_(s1) = G_CONSTANT i1 true
     %40:sreg_32(s1) = COPY %39(s1)
     %41:sreg_32(s1) = COPY %40(s1)
-    %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    %42:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), 0, %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
     G_BR %bb.4
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcdebe..da4eccad6041822 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -68,10 +68,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v1, v2, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dword v1, v2, s[2:3]
-; GFX906-NEXT:  .LBB1_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -149,10 +148,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[2:3]
-; GFX906-NEXT:  .LBB3_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -185,10 +183,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v5, s[2:3]
-; GFX906-NEXT:  .LBB4_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
@@ -222,11 +219,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[0:1]
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[0:1] offset:16
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v9, s[2:3]
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v9, s[2:3] offset:16
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
@@ -486,14 +482,13 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[10:11]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX906-NEXT:    s_and_b64 s[4:5], exec, vcc
 ; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX906-NEXT:  .LBB8_2: ; %Flow
+; GFX906-NEXT:  ; %bb.2: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
@@ -547,11 +542,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v5, s[10:11]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[12:13]
-; GFX906-NEXT:  .LBB9_3: ; %Flow
+; GFX906-NEXT:  ; %bb.3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB9_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll b/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
new file mode 100644
index 000000000000000..0bd80d2c6007c14
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
@@ -0,0 +1,621 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt --si-annotate-control-flow -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+declare { i1, i64 } @llvm.amdgcn.if.i64(i1, i1)
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_ann_cf(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ann_cf(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[CF_VAL:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[PSEUDO_COND:%.*]] = extractvalue { i1, i64 } [[CF_VAL]], 0
+; CHECK-NEXT:    [[MASK:%.*]] = extractvalue { i1, i64 } [[CF_VAL]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PSEUDO_COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[MASK]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  %cf_val = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %cond, i1 true)
+  %pseudo.cond = extractvalue { i1, i64 } %cf_val, 0
+  %mask = extractvalue { i1, i64 } %cf_val, 1
+  br i1 %pseudo.cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  call void @llvm.amdgcn.end.cf.i64(i64 %mask)
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_complex1(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex1(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[K:%.*]] = lshr i32 [[TID]], 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[K]], 15
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
+; CHECK-NEXT:    ret void
+;
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %k = lshr i32 %tid, 4
+  %cond = icmp ult i32 %k, 15
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_complex2(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex2(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[J:%.*]] = and i32 [[TID]], 15
+; CHECK-NEXT:    [[COND:%.*]] = icmp ult i32 [[J]], 15
+; CHECK-NEXT:    [[TMP1:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP1]], 1
+; CHECK-NEXT:    br i1 [[TMP2]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP3]])
+; CHECK-NEXT:    ret void
+;
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %j = and i32 %tid, 15
+  %cond = icmp ult i32 %j, 15
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(ptr addrspace(1) inreg %dest) !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR3:[0-9]+]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying:
+define amdgpu_kernel void @cond_store_even_only_flat_wgsz(ptr addrspace(1) inreg %dest) #0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_flat_wgsz(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying, since the y dimension varies in each
+; wavefront with the required work group size:
+define amdgpu_kernel void @cond_store_even_ydim_small_wgs(ptr addrspace(1) inreg %dest) !reqd_work_group_size !1 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim_small_wgs(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR3]] !reqd_work_group_size [[META1:![0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 3
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID_Y]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 3
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid.y, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying, even though there are no attributes with
+; work group size information:
+define amdgpu_kernel void @cond_store_even_no_attributes(ptr addrspace(1) inreg %dest) {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_no_attributes(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR3]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is likely varying, even though the condition only depends on a
+; workitem id dimension that does not vary per wavefront (namely y):
+define amdgpu_kernel void @cond_store_even_ydim(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID_Y]], 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid.y, 1
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because its condition is directly
+; loaded from memory:
+define amdgpu_kernel void @cond_store_loaded(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loaded(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
+  %lookup.value = load i32, ptr addrspace(1) %lookup.addr
+  %cond = icmp eq i32 %lookup.value, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The branch here is not likely varying, because its condition is loaded from memory after a loop with PHIs:
+define amdgpu_kernel void @cond_store_loop_phi(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loop_phi(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ [[VAL_INC:%.*]], %[[LOOP]] ], [ [[TID]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_DEC:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[VAL_INC]] = add i32 [[VAL]], 1
+; CHECK-NEXT:    [[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp eq i32 [[IDX_DEC]], 0
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label %[[LOOP_END:.*]], label %[[LOOP]]
+; CHECK:       [[LOOP_END]]:
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  br label %loop
+loop:
+  %val = phi i32 [%val.inc, %loop], [%tid, %entry]
+  %idx = phi i32 [%idx.dec, %loop], [%n, %entry]
+  %val.inc = add i32 %val, 1
+  %idx.dec = sub i32 %idx, 1
+  %loop.cond = icmp eq i32 %idx.dec, 0
+  br i1 %loop.cond, label %loop.end, label %loop
+loop.end:
+  %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
+  %lookup.value = load i32, ptr addrspace(1) %lookup.addr
+  %cond = icmp eq i32 %lookup.value, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+; The then and else branches are likely varying here:
+define amdgpu_kernel void @cond_store_even_ifelse(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ifelse(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LSBIT:%.*]] = and i32 [[TID]], 1
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LSBIT]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE_IF:.*]], label %[[CF_MID:.*]]
+; CHECK:       [[DO_STORE_IF]]:
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[CF_MID]]
+; CHECK:       [[CF_MID]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP2]], i1 true)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { i1, i64 } [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[DO_STORE_ELSE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE_ELSE]]:
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store.if, label %cf.mid
+do.store.if:
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %cf.mid
+cf.mid:
+  %notcond = phi i1 [0, %do.store.if], [1, %entry]
+  br i1 %notcond, label %do.store.else, label %exit
+do.store.else:
+  store i32 1, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+; The then and else branches are not likely varying here:
+define amdgpu_kernel void @cond_store_loaded_ifelse(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loaded_ifelse(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE_IF:.*]], label %[[CF_MID:.*]]
+; CHECK:       [[DO_STORE_IF]]:
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[CF_MID]]
+; CHECK:       [[CF_MID]]:
+; CHECK-NEXT:    [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP2]], i1 false)
+; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP3]], 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractvalue { i1, i64 } [[TMP3]], 1
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[DO_STORE_ELSE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE_ELSE]]:
+; CHECK-NEXT:    store i32 1, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP5]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
+  %lookup.value = load i32, ptr addrspace(1) %lookup.addr
+  %cond = icmp eq i32 %lookup.value, 0
+  br i1 %cond, label %do.store.if, label %cf.mid
+do.store.if:
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %cf.mid
+cf.mid:
+  %notcond = phi i1 [0, %do.store.if], [1, %entry]
+  br i1 %notcond, label %do.store.else, label %exit
+do.store.else:
+  store i32 1, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+
+attributes #0 = {"amdgpu-flat-work-group-size"="256,256"}
+!0 = !{i32 64, i32 4, i32 1}
+!1 = !{i32 8, i32 32, i32 1}
+;.
+; CHECK: [[META0]] = !{i32 64, i32 4, i32 1}
+; CHECK: [[META1]] = !{i32 8, i32 32, i32 1}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 4ae08a0375c8c39..292192c0a244dbe 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -62,13 +62,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX89-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX89-NEXT:    ; implicit-def: $vgpr1
 ; GFX89-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX89-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX89-NEXT:  ; %bb.2:
 ; GFX89-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
 ; GFX89-NEXT:    s_mul_i32 s12, s12, 5
 ; GFX89-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX89-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX89-NEXT:  .LBB0_3:
+; GFX89-NEXT:  ; %bb.3:
 ; GFX89-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX89-NEXT:    s_waitcnt vmcnt(0)
 ; GFX89-NEXT:    v_readfirstlane_b32 s4, v1
@@ -97,13 +96,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1064-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1064-NEXT:  ; %bb.2:
 ; GFX1064-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
 ; GFX1064-NEXT:    s_mul_i32 s12, s12, 5
 ; GFX1064-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX1064-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX1064-NEXT:  .LBB0_3:
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -132,13 +130,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1032-NEXT:    v_mbcnt_lo_u32_b32 v0, s10, 0
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1032-NEXT:  ; %bb.2:
 ; GFX1032-NEXT:    s_bcnt1_i32_b32 s10, s10
 ; GFX1032-NEXT:    s_mul_i32 s10, s10, 5
 ; GFX1032-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX1032-NEXT:    buffer_atomic_add v1, off, s[4:7], 0 glc
-; GFX1032-NEXT:  .LBB0_3:
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -170,14 +167,13 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1164-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1164-NEXT:    v_mbcnt_hi_u32_b32 v0, s13, v0
 ; GFX1164-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1164-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1164-NEXT:  ; %bb.2:
 ; GFX1164-NEXT:    s_bcnt1_i32_b64 s12, s[12:13]
 ; GFX1164-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164-NEXT:    s_mul_i32 s12, s12, 5
 ; GFX1164-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX1164-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
-; GFX1164-NEXT:  .LBB0_3:
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s4, v1
@@ -209,14 +205,13 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa
 ; GFX1132-NEXT:    ; implicit-def: $vgpr1
 ; GFX1132-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132-NEXT:    v_cmpx_eq_u32_e32 0, v0
-; GFX1132-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1132-NEXT:  ; %bb.2:
 ; GFX1132-NEXT:    s_bcnt1_i32_b32 s10, s10
 ; GFX1132-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132-NEXT:    s_mul_i32 s10, s10, 5
 ; GFX1132-NEXT:    v_mov_b32_e32 v1, s10
 ; GFX1132-NEXT:    buffer_atomic_add_u32 v1, off, s[4:7], 0 glc
-; GFX1132-NEXT:  .LBB0_3:
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s4, v1
@@ -294,11 +289,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX8-NEXT:    ; implicit-def: $vgpr0
 ; GFX8-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX8-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX8-NEXT:  .LBB1_3:
+; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_readfirstlane_b32 s4, v0
@@ -349,11 +343,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    ; implicit-def: $vgpr0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX9-NEXT:  ; %bb.2:
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX9-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX9-NEXT:  .LBB1_3:
+; GFX9-NEXT:  ; %bb.3:
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_readfirstlane_b32 s4, v0
@@ -408,11 +401,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1064-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX1064-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1064-NEXT:  ; %bb.2:
 ; GFX1064-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX1064-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX1064-NEXT:  .LBB1_3:
+; GFX1064-NEXT:  ; %bb.3:
 ; GFX1064-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1064-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1064-NEXT:    s_waitcnt vmcnt(0)
@@ -458,11 +450,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1032-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032-NEXT:    s_and_saveexec_b32 s9, vcc_lo
-; GFX1032-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1032-NEXT:  ; %bb.2:
 ; GFX1032-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX1032-NEXT:    buffer_atomic_add v0, off, s[4:7], 0 glc
-; GFX1032-NEXT:  .LBB1_3:
+; GFX1032-NEXT:  ; %bb.3:
 ; GFX1032-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX1032-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1032-NEXT:    s_waitcnt vmcnt(0)
@@ -527,11 +518,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1164-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX1164-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164-NEXT:    s_and_saveexec_b64 s[10:11], vcc
-; GFX1164-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1164-NEXT:  ; %bb.2:
 ; GFX1164-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX1164-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
-; GFX1164-NEXT:  .LBB1_3:
+; GFX1164-NEXT:  ; %bb.3:
 ; GFX1164-NEXT:    s_or_b64 exec, exec, s[10:11]
 ; GFX1164-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1164-NEXT:    v_readfirstlane_b32 s4, v0
@@ -585,11 +575,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
 ; GFX1132-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1132-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132-NEXT:    s_and_saveexec_b32 s9, vcc_lo
-; GFX1132-NEXT:    s_cbranch_execz .LBB1_3
 ; GFX1132-NEXT:  ; %bb.2:
 ; GFX1132-NEXT:    v_mov_b32_e32 v0, s11
 ; GFX1132-NEXT:    buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
-; GFX1132-NEXT:  .LBB1_3:
+; GFX1132-NEXT:  ; %bb.3:
 ; GFX1132-NEXT:    s_or_b32 exec_lo, exec_lo, s9
 ; GFX1132-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1132-NEXT:    v_readfirstlane_b32 s4, v0
diff --git a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
index 6483ff28c0de056..c7d81933d1e1873 100644
--- a/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
+++ b/llvm/test/CodeGen/AMDGPU/block-should-not-be-in-alive-blocks.mir
@@ -84,7 +84,7 @@ body:             |
     %0:sgpr_64 = COPY $sgpr4_sgpr5
     %1:vgpr_32 = COPY $vgpr0
     %2:sreg_32 = V_CMP_NE_U32_e64 0, %1, implicit $exec
-    %3:sreg_32 = SI_IF killed %2, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_32 = SI_IF killed %2, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.1:
@@ -119,7 +119,7 @@ body:             |
   bb.5:
     successors: %bb.1(0x40000000), %bb.7(0x40000000)
 
-    %14:sreg_32 = SI_ELSE %3, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %14:sreg_32 = SI_ELSE %3, 0, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.6:
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 98136347ab702ca..dc63ee30da56f6b 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -895,7 +895,7 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
   ; GFX90A-NEXT:   S_CBRANCH_VCCNZ %bb.66, implicit $vcc
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.64.bb159:
-  ; GFX90A-NEXT:   successors: %bb.67(0x40000000), %bb.65(0x40000000)
+  ; GFX90A-NEXT:   successors: %bb.67(0x80000000), %bb.65(0x00000000)
   ; GFX90A-NEXT:   liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr15, $vgpr17, $vgpr30, $vgpr31, $vgpr52, $vgpr53, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9:0x000000000000000F, $sgpr10_sgpr11, $sgpr18_sgpr19, $sgpr24_sgpr25, $sgpr34_sgpr35, $sgpr38_sgpr39, $sgpr40_sgpr41, $sgpr42_sgpr43, $sgpr44_sgpr45, $sgpr46_sgpr47, $sgpr48_sgpr49, $sgpr50_sgpr51, $sgpr52_sgpr53, $sgpr54_sgpr55, $sgpr58_sgpr59, $vgpr0_vgpr1:0x000000000000000F, $vgpr4_vgpr5:0x000000000000000F, $vgpr6_vgpr7:0x000000000000000F, $vgpr8_vgpr9:0x000000000000000F, $vgpr10_vgpr11:0x000000000000000C, $vgpr12_vgpr13:0x000000000000000C, $vgpr18_vgpr19:0x000000000000000C, $vgpr20_vgpr21:0x000000000000000C, $vgpr22_vgpr23:0x000000000000000C, $vgpr24_vgpr25:0x000000000000000C, $vgpr40_vgpr41:0x000000000000000F, $vgpr42_vgpr43:0x000000000000000F, $vgpr44_vgpr45:0x000000000000000F, $vgpr46_vgpr47:0x000000000000000F, $vgpr56_vgpr57:0x000000000000000F, $vgpr58_vgpr59:0x000000000000000F, $vgpr60_vgpr61:0x000000000000000F, $vgpr62_vgpr63:0x000000000000000F, $sgpr0_sgpr1_sgpr2_sgpr3
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT:   renamable $vcc = V_CMP_NE_U32_e64 0, killed $vgpr30, implicit $exec
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index fdae1696a5a4921..c887bfe5758f6bc 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -513,12 +513,11 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX7-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0xfff, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX7-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX7-NEXT:  .LBB3_2: ; %endif
+; GFX7-NEXT:  ; %bb.2: ; %endif
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x1000, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -534,12 +533,11 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0xfff, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX8-NEXT:  .LBB3_2: ; %endif
+; GFX8-NEXT:  ; %bb.2: ; %endif
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x1000, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -555,10 +553,9 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
 ; GFX9-NEXT:    flat_load_sbyte v4, v[2:3] offset:4095
-; GFX9-NEXT:  .LBB3_2: ; %endif
+; GFX9-NEXT:  ; %bb.2: ; %endif
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -574,12 +571,11 @@ define void @test_sink_flat_small_max_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
-; GFX10-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x800, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    flat_load_sbyte v4, v[2:3] offset:2047
-; GFX10-NEXT:  .LBB3_2: ; %endif
+; GFX10-NEXT:  ; %bb.2: ; %endif
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@@ -634,12 +630,11 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX7-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX7-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, 0x1000, v2
 ; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX7-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX7-NEXT:  .LBB4_2: ; %endif
+; GFX7-NEXT:  ; %bb.2: ; %endif
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x61a7c, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -655,12 +650,11 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, 0x1000, v2
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX8-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX8-NEXT:  .LBB4_2: ; %endif
+; GFX8-NEXT:  ; %bb.2: ; %endif
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x61a7c, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -676,12 +670,11 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0x1000, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX9-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX9-NEXT:  .LBB4_2: ; %endif
+; GFX9-NEXT:  ; %bb.2: ; %endif
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x61000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -697,12 +690,11 @@ define void @test_sink_flat_small_max_plus_1_flat_offset(ptr %out, ptr %in) #1 {
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
-; GFX10-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, 0x1000, v2
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    flat_load_sbyte v4, v[2:3]
-; GFX10-NEXT:  .LBB4_2: ; %endif
+; GFX10-NEXT:  ; %bb.2: ; %endif
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x61800, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
@@ -757,12 +749,11 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ; GFX7-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX7-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX7-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX7-NEXT:  ; %bb.1: ; %if
 ; GFX7-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX7-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX7-NEXT:    flat_load_sbyte v6, v[2:3]
-; GFX7-NEXT:  .LBB5_2: ; %endif
+; GFX7-NEXT:  ; %bb.2: ; %endif
 ; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x1000, v0
 ; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -778,12 +769,11 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX8-NEXT:  ; %bb.1: ; %if
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX8-NEXT:    flat_load_sbyte v6, v[2:3]
-; GFX8-NEXT:  .LBB5_2: ; %endif
+; GFX8-NEXT:  ; %bb.2: ; %endif
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x1000, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -799,12 +789,11 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX9-NEXT:  ; %bb.1: ; %if
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX9-NEXT:    flat_load_sbyte v6, v[2:3]
-; GFX9-NEXT:  .LBB5_2: ; %endif
+; GFX9-NEXT:  ; %bb.2: ; %endif
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
@@ -820,12 +809,11 @@ define void @test_sinkable_flat_reg_offset(ptr %out, ptr %in, i64 %reg) #1 {
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
-; GFX10-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX10-NEXT:  ; %bb.1: ; %if
 ; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
 ; GFX10-NEXT:    flat_load_sbyte v6, v[2:3]
-; GFX10-NEXT:  .LBB5_2: ; %endif
+; GFX10-NEXT:  ; %bb.2: ; %endif
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x1000, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
index 50c9c0cb64ccd60..4d2893522b67100 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll
@@ -187,13 +187,12 @@ define amdgpu_kernel void @uncollapsable_nested_if(ptr addrspace(1) nocapture %a
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    buffer_store_dword v4, v[3:4], s[0:3], 0 addr64
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB1_3
 ; GCN-NEXT:  ; %bb.2: ; %bb.inner.then
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, 1
 ; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:4
-; GCN-NEXT:  .LBB1_3: ; %bb.inner.end
+; GCN-NEXT:  ; %bb.3: ; %bb.inner.end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_mov_b32 s0, s2
 ; GCN-NEXT:    s_mov_b32 s1, s2
@@ -384,7 +383,6 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GCN-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GCN-NEXT:    s_cbranch_execz .LBB2_3
 ; GCN-NEXT:  ; %bb.2: ; %bb.else
 ; GCN-NEXT:    s_mov_b32 s6, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
@@ -393,7 +391,7 @@ define amdgpu_kernel void @nested_if_if_else(ptr addrspace(1) nocapture %arg) {
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    buffer_store_dword v0, v[3:4], s[4:7], 0 addr64 offset:8
 ; GCN-NEXT:    ; implicit-def: $vgpr3_vgpr4
-; GCN-NEXT:  .LBB2_3: ; %Flow
+; GCN-NEXT:  ; %bb.3: ; %Flow
 ; GCN-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
 ; GCN-NEXT:    s_cbranch_execz .LBB2_5
 ; GCN-NEXT:  ; %bb.4: ; %bb.then
@@ -616,7 +614,6 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:12
 ; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB3_3
 ; GCN-NEXT:  ; %bb.2: ; %bb.inner.then2
 ; GCN-NEXT:    s_mov_b32 s10, 0
 ; GCN-NEXT:    s_mov_b32 s11, 0xf000
@@ -624,7 +621,7 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-NEXT:    s_mov_b32 s9, s10
 ; GCN-NEXT:    v_mov_b32_e32 v0, 4
 ; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[8:11], 0 addr64 offset:16
-; GCN-NEXT:  .LBB3_3: ; %Flow
+; GCN-NEXT:  ; %bb.3: ; %Flow
 ; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GCN-NEXT:    ; implicit-def: $vgpr1_vgpr2
 ; GCN-NEXT:    ; implicit-def: $vgpr0
@@ -639,11 +636,10 @@ define amdgpu_kernel void @nested_if_else_if(ptr addrspace(1) nocapture %arg) {
 ; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 2, v0
 ; GCN-NEXT:    buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 offset:4
 ; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB3_7
 ; GCN-NEXT:  ; %bb.6: ; %bb.inner.then
 ; GCN-NEXT:    v_mov_b32_e32 v0, 2
 ; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8
-; GCN-NEXT:  .LBB3_7: ; %Flow1
+; GCN-NEXT:  ; %bb.7: ; %Flow1
 ; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-NEXT:  .LBB3_8: ; %bb.outer.end
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
index b278bfca7f7a334..cc8a9524a1b970e 100644
--- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -36,12 +36,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -90,12 +90,12 @@ body:             |
   ; GCN-NEXT:   $exec = S_OR_B64 $exec, [[COPY]], implicit-def $scc
   ; GCN-NEXT:   S_ENDPGM 0
   bb.0:
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -148,12 +148,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -210,12 +210,12 @@ body:             |
     successors: %bb.1, %bb.4
     liveins: $vgpr0, $sgpr0_sgpr1
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -273,12 +273,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -334,12 +334,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -393,13 +393,13 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
     %2:vreg_128 = IMPLICIT_DEF
-    %3:sreg_64 = SI_IF undef %4:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64 = SI_IF undef %4:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -452,12 +452,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -565,7 +565,7 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.2
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2
@@ -573,12 +573,12 @@ body:             |
 
   bb.2:
     successors: %bb.3, %bb.6
-    %2:sreg_64 = SI_ELSE %0:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_ELSE %0:sreg_64, 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.3:
     successors: %bb.3, %bb.4
 
-    %3:sreg_64 = SI_IF undef %4:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64 = SI_IF undef %4:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.4:
     successors: %bb.5
@@ -646,7 +646,7 @@ body:             |
     S_BRANCH %bb.6
 
   bb.1:
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
     S_BRANCH %bb.6
@@ -660,7 +660,7 @@ body:             |
   bb.5:
 
   bb.6:
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.0
     S_ENDPGM 0
 
@@ -705,12 +705,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -774,12 +774,12 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.4
 
-    %0:sreg_64 = SI_IF undef %1:sreg_64, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF undef %1:sreg_64, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.1:
     successors: %bb.2, %bb.3
 
-    %2:sreg_64 = SI_IF undef %3:sreg_64, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF undef %3:sreg_64, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
   bb.2:
 
@@ -910,7 +910,7 @@ body: |
 
     %0:vgpr_32 = IMPLICIT_DEF
     %1:sreg_64 = V_CMP_EQ_U32_e64 0, killed %0:vgpr_32, implicit $exec
-    %2:sreg_64 = SI_IF %1:sreg_64, %bb.14, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %2:sreg_64 = SI_IF %1:sreg_64, 0, %bb.14, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -918,7 +918,7 @@ body: |
 
     %3:vgpr_32 = IMPLICIT_DEF
     %4:sreg_64 = V_CMP_EQ_U32_e64 0, killed %3:vgpr_32, implicit $exec
-    %5:sreg_64 = SI_IF killed %4:sreg_64, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64 = SI_IF killed %4:sreg_64, 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -926,7 +926,7 @@ body: |
 
     %6:vgpr_32 = IMPLICIT_DEF
     %7:sreg_64 = V_CMP_EQ_U32_e64 0, killed %6:vgpr_32, implicit $exec
-    %8:sreg_64 = SI_IF killed %7:sreg_64, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %8:sreg_64 = SI_IF killed %7:sreg_64, 0, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
@@ -934,7 +934,7 @@ body: |
 
     %9:vgpr_32 = IMPLICIT_DEF
     %10:sreg_64 = V_CMP_EQ_U32_e64 0, killed %9:vgpr_32, implicit $exec
-    %11:sreg_64 = SI_IF killed %10:sreg_64, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %11:sreg_64 = SI_IF killed %10:sreg_64, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
@@ -970,7 +970,7 @@ body: |
 
     %12:vgpr_32 = IMPLICIT_DEF
     %13:sreg_64 = V_CMP_EQ_U32_e64 0, killed %12:vgpr_32, implicit $exec
-    %14:sreg_64 = SI_IF killed %13:sreg_64, %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %14:sreg_64 = SI_IF killed %13:sreg_64, 0, %bb.12, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.11
 
   bb.10:
@@ -986,7 +986,7 @@ body: |
   bb.12:
     successors: %bb.10, %bb.13
 
-    %15:sreg_64 = SI_ELSE %14:sreg_64, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %15:sreg_64 = SI_ELSE %14:sreg_64, 0, %bb.13, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.10
 
   bb.13:
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
new file mode 100644
index 000000000000000..8fc9e357969beea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+; Check that simple conditional memory accesses that are guarded by likely
+; varying conditions are not lowered with an s_cbranch_execz to bypass them.
+; Instructions like s_waitcnt vmcnt(0) block the elimination of s_cbranch_execz.
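+; In @cond_ops, the execz branch around %do.load is expected to be removed,
+; while the one around %do.store is expected to remain since the skipped block
+; contains an s_waitcnt vmcnt(0).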
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.amdgcn.workitem.id.y()
+
+define amdgpu_kernel void @cond_ops(ptr addrspace(1) inreg %x, ptr addrspace(1) inreg %y) !reqd_work_group_size !0 {
+; CHECK-LABEL: cond_ops:
+; CHECK:       ; %bb.5:
+; CHECK-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_branch .LBB0_0
+; CHECK-NEXT:    .p2align 8
+; CHECK-NEXT:  ; %bb.6:
+; CHECK-NEXT:  .LBB0_0: ; %entry
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; CHECK-NEXT:    v_bfe_u32 v0, v0, 10, 10
+; CHECK-NEXT:    v_lshl_or_b32 v5, v0, 6, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 4, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v0
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v4, 4, v5
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:  ; %bb.1: ; %do.load
+; CHECK-NEXT:    global_load_dwordx4 v[0:3], v4, s[8:9]
+; CHECK-NEXT:  ; %bb.2: ; %post.load
+; CHECK-NEXT:    s_or_b64 exec, exec, s[0:1]
+; CHECK-NEXT:    v_and_b32_e32 v5, 15, v5
+; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 15, v5
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_cbranch_execz .LBB0_4
+; CHECK-NEXT:  ; %bb.3: ; %do.store
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    global_store_dwordx4 v4, v[0:3], s[10:11]
+; CHECK-NEXT:  .LBB0_4: ; %exit
+; CHECK-NEXT:    s_endpgm
+entry:
+  %tid.x = tail call range(i32 0, 64) i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call range(i32 0, 4) i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %k = lshr i32 %tid, 4
+  %j = and i32 %tid, 15
+  %load.cond = icmp ult i32 %k, 15
+  %tid.ext = zext nneg i32 %tid to i64
+  %my.x = getelementptr <4 x float>, ptr addrspace(1) %x, i64 %tid.ext
+  br i1 %load.cond, label %do.load, label %post.load
+do.load:
+  %loaded = load <4 x float>, ptr addrspace(1) %my.x
+  br label %post.load
+post.load:
+  %maybe.loaded = phi <4 x float> [ %loaded, %do.load ], [ zeroinitializer, %entry ]
+  %my.y = getelementptr <4 x float>, ptr addrspace(1) %y, i64 %tid.ext
+  %store.cond = icmp ult i32 %j, 15
+  br i1 %store.cond, label %do.store, label %exit
+do.store:
+  store <4 x float> %maybe.loaded, ptr addrspace(1) %my.y
+  br label %exit
+exit:
+  ret void
+}
+
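+; Both the then and the else branches are likely varying here, so neither
+; execz branch is expected to be kept: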
+define amdgpu_kernel void @cond_store_even_ifelse(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: cond_store_even_ifelse:
+; CHECK:       ; %bb.5:
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_branch .LBB1_0
+; CHECK-NEXT:    .p2align 8
+; CHECK-NEXT:  ; %bb.6:
+; CHECK-NEXT:  .LBB1_0: ; %entry
+; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 4, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xffc0, v1
+; CHECK-NEXT:    s_movk_i32 s0, 0x3ff
+; CHECK-NEXT:    v_and_or_b32 v0, v0, s0, v1
+; CHECK-NEXT:    v_and_b32_e32 v4, 1, v0
+; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0
+; CHECK-NEXT:    v_lshl_add_u64 v[0:1], s[8:9], 0, v[2:3]
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; CHECK-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; CHECK-NEXT:  ; %bb.1: ; %do.store.if
+; CHECK-NEXT:    global_store_dword v[0:1], v3, off
+; CHECK-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; CHECK-NEXT:  ; %bb.2: ; %cf.mid
+; CHECK-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; CHECK-NEXT:  ; %bb.3: ; %do.store.else
+; CHECK-NEXT:    v_mov_b32_e32 v2, 1
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:  ; %bb.4: ; %exit
+; CHECK-NEXT:    s_endpgm
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %lsbit = and i32 %tid, 1
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  %cond = icmp eq i32 %lsbit, 0
+  br i1 %cond, label %do.store.if, label %cf.mid
+do.store.if:
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %cf.mid
+cf.mid:
+  %notcond = phi i1 [0, %do.store.if], [1, %entry]
+  br i1 %notcond, label %do.store.else, label %exit
+do.store.else:
+  store i32 1, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+
+!0 = !{i32 64, i32 4, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 3db2b6ed9ab4ba8..15b1d04c497656b 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/llvm/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -407,7 +407,7 @@ body:             |
     liveins: $vgpr0, $sgpr4_sgpr5
 
     %19:sreg_64 = IMPLICIT_DEF
-    %0:sreg_64 = SI_IF killed %19, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_64 = SI_IF killed %19, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
index 012f33952f990eb..eadcbc4c1fcdc6e 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll
@@ -10,7 +10,7 @@ define i32 @divergent_lshr_and_cmp(i32 %x) {
   ; GCN-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 2, [[COPY]], implicit $exec
   ; GCN-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 killed [[V_AND_B32_e64_]], 0, implicit $exec
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1.out.true:
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
index 179d0becf6693a4..5938eb2447ec96e 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine.mir
@@ -425,7 +425,7 @@ body: |
     %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
 
     %7:sreg_64 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
-    %8:sreg_64 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %8:sreg_64 = SI_IF %7, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 41b61f2e09a3d32..e3e6f19f1cedb82 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -378,7 +378,7 @@ body: |
     %6:vgpr_32 = V_OR_B32_e32 %3, %1, implicit $exec
 
     %7:sreg_32 = V_CMP_EQ_U32_e64 %5, %6, implicit $exec
-    %8:sreg_32 = SI_IF %7, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %8:sreg_32 = SI_IF %7, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
index 8a7762fb4b6c7f0..a301a680f871c8c 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -45,7 +45,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX908-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX908-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
-  ; GFX908-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX908-NEXT:   S_BRANCH %bb.1
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.1 (%ir-block.5):
@@ -77,7 +77,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX908-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
   ; GFX908-NEXT:   early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
   ; GFX908-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
-  ; GFX908-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX908-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], -1, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX908-NEXT:   S_BRANCH %bb.2
   ; GFX908-NEXT: {{  $}}
   ; GFX908-NEXT: bb.2 (%ir-block.31):
@@ -107,7 +107,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX90A_GFX940-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX90A_GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
-  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.1
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.1 (%ir-block.5):
@@ -139,7 +139,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX90A_GFX940-NEXT:   [[V_READLANE_B32_:%[0-9]+]]:sreg_32 = V_READLANE_B32 killed [[V_ADD_F32_e64_5]], killed [[S_MOV_B32_1]]
   ; GFX90A_GFX940-NEXT:   early-clobber %1:sgpr_32 = STRICT_WWM killed [[V_READLANE_B32_]], implicit $exec
   ; GFX90A_GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
-  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A_GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_EQ_U32_e64_]], -1, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A_GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX90A_GFX940-NEXT: {{  $}}
   ; GFX90A_GFX940-NEXT: bb.2 (%ir-block.31):
@@ -169,7 +169,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11_GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
   ; GFX11_GFX12-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX11_GFX12-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE
-  ; GFX11_GFX12-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11_GFX12-NEXT:   S_BRANCH %bb.1
   ; GFX11_GFX12-NEXT: {{  $}}
   ; GFX11_GFX12-NEXT: bb.1 (%ir-block.5):
@@ -195,7 +195,7 @@ define amdgpu_ps void @global_atomic_fadd_f32_saddr_no_rtn_atomicrmw(ptr addrspa
   ; GFX11_GFX12-NEXT:   [[V_ADD_F32_e64_4:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[V_ADD_F32_e64_3]], 0, killed [[V_PERMLANEX16_B32_e64_]], 0, 0, implicit $mode, implicit $exec
   ; GFX11_GFX12-NEXT:   early-clobber %1:vgpr_32 = STRICT_WWM killed [[V_ADD_F32_e64_4]], implicit $exec
   ; GFX11_GFX12-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
-  ; GFX11_GFX12-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11_GFX12-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_EQ_U32_e64_]], -1, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11_GFX12-NEXT:   S_BRANCH %bb.2
   ; GFX11_GFX12-NEXT: {{  $}}
   ; GFX11_GFX12-NEXT: bb.2 (%ir-block.24):
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
index 1fb34abb41a2de3..bd6365de2b482ec 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -47,7 +47,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX90A-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
   ; GFX90A-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.1
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.1 (%ir-block.5):
@@ -82,7 +82,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX90A-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX90A-NEXT:   [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
   ; GFX90A-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX90A-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX90A-NEXT:   S_BRANCH %bb.2
   ; GFX90A-NEXT: {{  $}}
   ; GFX90A-NEXT: bb.2 (%ir-block.32):
@@ -129,7 +129,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX940-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_64 = SI_PS_LIVE
   ; GFX940-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.1
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.1 (%ir-block.5):
@@ -164,7 +164,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_HI_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX940-NEXT:   [[COPY8:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
   ; GFX940-NEXT:   [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX940-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX940-NEXT:   S_BRANCH %bb.2
   ; GFX940-NEXT: {{  $}}
   ; GFX940-NEXT: bb.2 (%ir-block.32):
@@ -211,7 +211,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[COPY3:%[0-9]+]]:sreg_64_xexec_xnull = COPY [[REG_SEQUENCE]]
   ; GFX11-NEXT:   [[SI_PS_LIVE:%[0-9]+]]:sreg_32 = SI_PS_LIVE
   ; GFX11-NEXT:   [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[SI_PS_LIVE]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.1
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.1 (%ir-block.5):
@@ -248,7 +248,7 @@ define amdgpu_ps float @global_atomic_fadd_f32_saddr_rtn_atomicrmw(ptr addrspace
   ; GFX11-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_EQ_U32_e64 killed [[V_MBCNT_LO_U32_B32_e64_]], [[S_MOV_B32_]], implicit $exec
   ; GFX11-NEXT:   [[COPY6:%[0-9]+]]:vreg_1 = COPY [[V_CMP_EQ_U32_e64_]]
   ; GFX11-NEXT:   [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
-  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX11-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_EQ_U32_e64_]], -1, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX11-NEXT:   S_BRANCH %bb.2
   ; GFX11-NEXT: {{  $}}
   ; GFX11-NEXT: bb.2 (%ir-block.29):
diff --git a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir
index ac0931b6022f1ec..bcfcb3b74fc9c65 100644
--- a/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir
+++ b/llvm/test/CodeGen/AMDGPU/i1_copy_phi_with_phi_incoming_value.mir
@@ -17,7 +17,7 @@ body:             |
   ; GCN-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]](s32)
   ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY1]](s32), [[S_LOAD_DWORD_IMM]], implicit $exec
   ; GCN-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_I32_e64_]], 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
@@ -46,7 +46,7 @@ body:             |
   ; GCN-NEXT:   S_BARRIER
   ; GCN-NEXT:   ATOMIC_FENCE 4, 2
   ; GCN-NEXT:   [[COPY6:%[0-9]+]]:sreg_64 = COPY %18
-  ; GCN-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[COPY6]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_64 = SI_IF [[COPY6]], 0, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.4
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.4:
@@ -85,7 +85,7 @@ body:             |
     %9:sreg_64 = V_CMP_LT_I32_e64 %2:vgpr_32(s32), %3:sreg_32_xm0_xexec, implicit $exec
     %4:sreg_64 = S_MOV_B64 0
     %17:vreg_1 = COPY %4:sreg_64, implicit $exec
-    %16:sreg_64 = SI_IF killed %9:sreg_64, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %16:sreg_64 = SI_IF killed %9:sreg_64, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -118,7 +118,7 @@ body:             |
     S_BARRIER
     ATOMIC_FENCE 4, 2
     %23:sreg_64 = COPY %22:vreg_1
-    %24:sreg_64 = SI_IF %23:sreg_64, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %24:sreg_64 = SI_IF %23:sreg_64, 0, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
index ea3d57d127151b3..958dee759468c5f 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll
@@ -54,27 +54,28 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11:       ; %bb.0: ; %bb
 ; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
 ; GFX11-NEXT:    v_mov_b32_e32 v31, v0
-; GFX11-NEXT:    s_load_b32 s19, s[16:17], 0x24
+; GFX11-NEXT:    s_load_b32 s24, s[16:17], 0x24
+; GFX11-NEXT:    s_mov_b32 s18, s14
 ; GFX11-NEXT:    s_mov_b32 s12, s13
 ; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
-; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
+; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
 ; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX11-NEXT:    s_mov_b32 s20, 0
 ; GFX11-NEXT:    s_mov_b32 s0, -1
 ; GFX11-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11-NEXT:    s_mov_b32 s32, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    v_mul_lo_u32 v0, s19, v0
+; GFX11-NEXT:    v_mul_lo_u32 v0, s24, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11-NEXT:    s_cbranch_execz .LBB2_13
 ; GFX11-NEXT:  ; %bb.1: ; %bb14
 ; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
-; GFX11-NEXT:    s_mov_b32 s18, 0
+; GFX11-NEXT:    s_mov_b32 s19, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
-; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
+; GFX11-NEXT:    s_cselect_b32 s25, -1, 0
 ; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
 ; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
 ; GFX11-NEXT:  ; %bb.2: ; %bb15
@@ -83,13 +84,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
+; GFX11-NEXT:    s_mov_b32 s13, s18
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11-NEXT:    s_mov_b32 s21, s14
 ; GFX11-NEXT:    s_mov_b32 s14, s15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
-; GFX11-NEXT:    s_mov_b32 s14, s21
 ; GFX11-NEXT:    s_mov_b32 s2, -1
 ; GFX11-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX11-NEXT:    s_branch .LBB2_12
@@ -125,11 +124,11 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s22
 ; GFX11-NEXT:    s_mul_i32 s0, s0, s20
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s0, s19, s0
+; GFX11-NEXT:    s_or_b32 s0, s24, s0
 ; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
 ; GFX11-NEXT:    s_mov_b32 s0, s1
 ; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s25
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
@@ -146,8 +145,8 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_and_b32 s1, s8, s1
 ; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
+; GFX11-NEXT:    v_readfirstlane_b32 s14, v2
+; GFX11-NEXT:    s_cselect_b32 s1, s14, s13
 ; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
 ; GFX11-NEXT:    s_and_b32 s1, s1, 1
 ; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
@@ -156,8 +155,8 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
 ; GFX11-NEXT:    v_readfirstlane_b32 s13, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
-; GFX11-NEXT:    s_cselect_b32 s13, s19, s13
+; GFX11-NEXT:    v_readfirstlane_b32 s14, v2
+; GFX11-NEXT:    s_cselect_b32 s13, s14, s13
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
 ; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
@@ -177,10 +176,10 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
 ; GFX11-NEXT:  ; %bb.11: ; %Flow6
-; GFX11-NEXT:    s_mov_b32 s18, -1
+; GFX11-NEXT:    s_mov_b32 s19, -1
 ; GFX11-NEXT:  .LBB2_12: ; %Flow11
 ; GFX11-NEXT:    s_and_b32 s20, s2, exec_lo
-; GFX11-NEXT:    s_or_not1_b32 s0, s18, exec_lo
+; GFX11-NEXT:    s_or_not1_b32 s0, s19, exec_lo
 ; GFX11-NEXT:  .LBB2_13: ; %Flow9
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11-NEXT:    s_and_saveexec_b32 s3, s0
@@ -191,7 +190,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg
 ; GFX11-NEXT:    s_getpc_b64 s[0:1]
 ; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
 ; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
-; GFX11-NEXT:    s_mov_b32 s13, s14
+; GFX11-NEXT:    s_mov_b32 s13, s18
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_mov_b32 s14, s15
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index e4602f20f8a37cf..6f119f5b7d5030d 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -7895,7 +7895,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
 ; GFX12-NEXT:    s_add_co_i32 s1, s3, 4
 ; GFX12-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX12-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX12-NEXT:  ; %bb.1:
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    s_bcnt1_i32_b32 s3, s6
@@ -7905,7 +7904,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1
 ; GFX12-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX12-NEXT:  .LBB29_2:
+; GFX12-NEXT:  ; %bb.2:
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX12-NEXT:    s_mov_b32 s7, exec_lo
@@ -7916,7 +7915,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v2
-; GFX12-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX12-NEXT:  ; %bb.3:
 ; GFX12-NEXT:    s_bcnt1_i32_b32 s0, s7
 ; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -7925,8 +7923,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX12-NEXT:    s_wait_alu 0xfffe
 ; GFX12-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
 ; GFX12-NEXT:    ds_add_f32 v2, v1
-; GFX12-NEXT:  .LBB29_4:
-; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:  ; %bb.4:
 ; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX12-NEXT:    s_mov_b32 s1, exec_lo
@@ -7987,7 +7984,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX940-NEXT:    s_add_i32 s3, s3, 4
 ; GFX940-NEXT:    ; implicit-def: $vgpr1
 ; GFX940-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX940-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX940-NEXT:  ; %bb.1:
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX940-NEXT:    s_lshl_b32 s8, s3, 3
@@ -7995,7 +7991,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX940-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX940-NEXT:  .LBB29_2:
+; GFX940-NEXT:  ; %bb.2:
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8004,7 +8000,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v1, s9, v1
 ; GFX940-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
 ; GFX940-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
-; GFX940-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX940-NEXT:  ; %bb.3:
 ; GFX940-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
 ; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
@@ -8012,7 +8007,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX940-NEXT:    ds_add_f32 v2, v1
-; GFX940-NEXT:  .LBB29_4:
+; GFX940-NEXT:  ; %bb.4:
 ; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX940-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -8068,7 +8063,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_add_i32 s1, s3, 4
 ; GFX11-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX11-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_bcnt1_i32_b32 s3, s6
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -8077,7 +8071,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1
 ; GFX11-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX11-NEXT:  .LBB29_2:
+; GFX11-NEXT:  ; %bb.2:
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_mov_b32 s7, exec_lo
@@ -8086,7 +8080,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
 ; GFX11-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v2
-; GFX11-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX11-NEXT:  ; %bb.3:
 ; GFX11-NEXT:    s_bcnt1_i32_b32 s0, s7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -8095,7 +8088,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mul_f32 v1, 0x42280000, v1
 ; GFX11-NEXT:    ds_add_f32 v2, v1
-; GFX11-NEXT:  .LBB29_4:
+; GFX11-NEXT:  ; %bb.4:
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX11-NEXT:    v_bfrev_b32_e32 v1, 1
@@ -8151,7 +8144,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_add_i32 s1, s3, 4
 ; GFX10-NEXT:    s_and_saveexec_b32 s0, vcc_lo
-; GFX10-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX10-NEXT:  ; %bb.1:
 ; GFX10-NEXT:    s_bcnt1_i32_b32 s3, s6
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
@@ -8159,7 +8151,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX10-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX10-NEXT:  .LBB29_2:
+; GFX10-NEXT:  ; %bb.2:
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_mov_b32 s7, exec_lo
@@ -8168,7 +8160,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
 ; GFX10-NEXT:    s_and_saveexec_b32 s6, s0
-; GFX10-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX10-NEXT:  ; %bb.3:
 ; GFX10-NEXT:    s_bcnt1_i32_b32 s0, s7
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
@@ -8176,7 +8167,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX10-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX10-NEXT:    ds_add_f32 v2, v1
-; GFX10-NEXT:  .LBB29_4:
+; GFX10-NEXT:  ; %bb.4:
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s6
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
@@ -8229,7 +8220,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX90A-NEXT:    s_add_i32 s3, s3, 4
 ; GFX90A-NEXT:    ; implicit-def: $vgpr1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX90A-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX90A-NEXT:    s_lshl_b32 s8, s3, 3
@@ -8237,7 +8227,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX90A-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX90A-NEXT:  .LBB29_2:
+; GFX90A-NEXT:  ; %bb.2:
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8246,7 +8236,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v1, s9, v1
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
 ; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
-; GFX90A-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX90A-NEXT:  ; %bb.3:
 ; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
 ; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
@@ -8254,7 +8243,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX90A-NEXT:    ds_add_f32 v2, v1
-; GFX90A-NEXT:  .LBB29_4:
+; GFX90A-NEXT:  ; %bb.4:
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -8309,7 +8298,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX908-NEXT:    s_add_i32 s3, s3, 4
 ; GFX908-NEXT:    ; implicit-def: $vgpr1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX908-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX908-NEXT:    s_lshl_b32 s8, s3, 3
@@ -8317,7 +8305,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX908-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX908-NEXT:  .LBB29_2:
+; GFX908-NEXT:  ; %bb.2:
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8326,7 +8314,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v1, s9, v1
 ; GFX908-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
 ; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
-; GFX908-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX908-NEXT:  ; %bb.3:
 ; GFX908-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
 ; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
@@ -8334,7 +8321,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX908-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX908-NEXT:    ds_add_f32 v2, v1
-; GFX908-NEXT:  .LBB29_4:
+; GFX908-NEXT:  ; %bb.4:
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX908-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
@@ -8390,7 +8377,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:    ; implicit-def: $vgpr1
 ; GFX8-NEXT:    s_mov_b32 m0, -1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX8-NEXT:    s_cbranch_execz .LBB29_2
 ; GFX8-NEXT:  ; %bb.1:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
 ; GFX8-NEXT:    s_lshl_b32 s8, s3, 3
@@ -8398,7 +8384,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s8
 ; GFX8-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX8-NEXT:  .LBB29_2:
+; GFX8-NEXT:  ; %bb.2:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    s_mov_b64 s[8:9], exec
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
@@ -8407,7 +8393,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:    v_mbcnt_hi_u32_b32 v1, s9, v1
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v1
 ; GFX8-NEXT:    s_and_saveexec_b64 s[6:7], s[0:1]
-; GFX8-NEXT:    s_cbranch_execz .LBB29_4
 ; GFX8-NEXT:  ; %bb.3:
 ; GFX8-NEXT:    s_bcnt1_i32_b64 s0, s[8:9]
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
@@ -8415,7 +8400,7 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
 ; GFX8-NEXT:    ds_add_f32 v2, v1
-; GFX8-NEXT:  .LBB29_4:
+; GFX8-NEXT:  ; %bb.4:
 ; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
index d156a0aef6c17e7..2e7692d5ac7746b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-intervals.mir
@@ -49,7 +49,7 @@ body:             |
 
     %2:vgpr_32 = COPY killed $vgpr0
     %6:sreg_32 = V_CMP_NE_U32_e64 0, killed %2, implicit $exec
-    %0:sreg_32 = SI_IF killed %6, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_32 = SI_IF killed %6, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.1:
@@ -63,7 +63,7 @@ body:             |
   bb.3:
     successors: %bb.4(0x40000000), %bb.1(0x40000000)
 
-    %1:sreg_32 = SI_ELSE killed %0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1:sreg_32 = SI_ELSE killed %0, 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
@@ -166,14 +166,14 @@ body:             |
 
     %2:vgpr_32 = COPY killed $vgpr0
     %5:sreg_32 = nofpexcept V_CMP_NGT_F32_e64 0, 0, 0, %2, 0, implicit $mode, implicit $exec
-    %0:sreg_32 = SI_IF killed %5, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_32 = SI_IF killed %5, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
     successors: %bb.2(0x40000000), %bb.3(0x40000000)
 
     %7:sreg_32 = nofpexcept V_CMP_NLT_F32_e64 0, 0, 0, killed %2, 0, implicit $mode, implicit $exec
-    %1:sreg_32 = SI_IF killed %7, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1:sreg_32 = SI_IF killed %7, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -271,7 +271,7 @@ body:             |
     %8:vgpr_32 = COPY killed $vgpr0
     %10:sreg_32 = S_MOV_B32 0
     %11:sreg_32 = V_CMP_NE_U32_e64 0, %8, implicit $exec
-    %0:sreg_32 = SI_IF killed %11, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %0:sreg_32 = SI_IF killed %11, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -327,7 +327,7 @@ body:             |
     %19:sreg_32 = S_MOV_B32 0
     %20:vgpr_32 = V_MBCNT_LO_U32_B32_e64 killed %13, 0, implicit $exec
     %21:sreg_32 = V_CMP_EQ_U32_e64 0, killed %20, implicit $exec
-    %7:sreg_32 = SI_IF killed %21, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %7:sreg_32 = SI_IF killed %21, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
index 02e3d7e81fd4051..c3e9f3c607d101b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.mir
@@ -64,7 +64,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.1:
@@ -81,7 +81,7 @@ body:             |
     %9:vgpr_32 = PHI %8, %bb.1, %7, %bb.2, %1, %bb.0
     GLOBAL_STORE_DWORD undef %10:vreg_64, %9, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
     %7:vgpr_32 = COPY killed %9
-    %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
 ...
@@ -151,7 +151,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.1:
@@ -174,7 +174,7 @@ body:             |
     %10:vgpr_32 = PHI %9, %bb.2, %7, %bb.3, %1, %bb.0
     GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
     %7:vgpr_32 = COPY killed %10
-    %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
 ...
@@ -239,7 +239,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.1:
@@ -256,7 +256,7 @@ body:             |
     %10:vgpr_32 = PHI %9, %bb.1, %7, %bb.2, %1, %bb.0
     GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
     %7:vgpr_32 = COPY killed %10
-    %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir
index f4e26aeae67666e..5062c3ef1de27d2 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-live-variables-update.xfail.mir
@@ -19,7 +19,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, killed %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.1:
@@ -36,7 +36,7 @@ body:             |
     %10:vgpr_32 = PHI %9, %bb.2, %7, %bb.3, %1, %bb.0
     GLOBAL_STORE_DWORD undef %11:vreg_64, %10, 0, 0, implicit $exec :: (volatile store (s32), addrspace 1)
     %7:vgpr_32 = COPY killed %10
-    %5:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
index eaf398fd517239d..e458057f8c21a67 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-control-flow-other-terminators.mir
@@ -44,7 +44,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
     S_BRANCH %bb.2
 
@@ -90,7 +90,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_IF %2, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_IF %2, 0, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
 
   bb.1:
@@ -134,7 +134,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:sreg_64_xexec = COPY $sgpr4_sgpr5
     %2:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %0, implicit $exec
-    %3:sreg_64_xexec = SI_ELSE %2, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_64_xexec = SI_ELSE %2, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %4:sreg_64_xexec = S_MOV_B64_term killed %1, implicit $exec
     S_BRANCH %bb.2
 
@@ -250,7 +250,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %10:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %10:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %14:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec
     %13:sreg_64_xexec = S_MOV_B64_term %10, implicit $exec
     S_BRANCH %bb.2
@@ -264,7 +264,7 @@ body:             |
     %12:sreg_64_xexec = COPY %14
     SI_END_CF killed %12, implicit-def $exec, implicit-def dead $scc, implicit $exec
     S_SLEEP 1
-    %9:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %9:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %14:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec
     %13:sreg_64_xexec = S_MOV_B64_term %9, implicit $exec
     S_BRANCH %bb.2
diff --git a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir
index c5e2ba5d8c7cba3..a53f1cd4372575f 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir
+++ b/llvm/test/CodeGen/AMDGPU/lower-i1-copies-clear-kills.mir
@@ -33,7 +33,7 @@ body:             |
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %16, %bb.5
   ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY5]], %bb.0, %18, %bb.5
   ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY4]]
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY6]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY6]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -53,7 +53,7 @@ body:             |
   ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:sreg_32 = COPY [[PHI3]]
   ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
-  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY7]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY7]], 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.4
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.4:
@@ -105,7 +105,7 @@ body:             |
     %15:sreg_32 = PHI %2, %bb.0, %16, %bb.5
     %17:vgpr_32 = PHI %12, %bb.0, %18, %bb.5
     %19:sreg_32 = COPY %11
-    %20:sreg_32 = SI_IF %19, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %20:sreg_32 = SI_IF %19, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -126,7 +126,7 @@ body:             |
     %28:sreg_32 = COPY %25
     %29:vgpr_32 = COPY %27
     %30:vreg_1 = COPY %26, implicit $exec
-    %31:sreg_32 = SI_IF %28, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %31:sreg_32 = SI_IF %28, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
index 0fc31ea9d643791..a9e145616373dac 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-ignorable-exec-use.mir
@@ -28,7 +28,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.1:
@@ -70,7 +70,7 @@ body:             |
     %16:sreg_32 = S_MOV_B32 1
     %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
     %18:sreg_64 = COPY %17
-    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %19:sreg_64 = SI_IF %18, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -114,7 +114,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.1:
@@ -153,7 +153,7 @@ body:             |
     %16:sreg_32 = S_MOV_B32 1
     %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
     %18:sreg_64 = COPY %17
-    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %19:sreg_64 = SI_IF %18, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -199,7 +199,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.1:
@@ -238,7 +238,7 @@ body:             |
     %16:sreg_32 = S_MOV_B32 1
     %17:sreg_64 = V_CMP_LT_I32_e64 %15(s32), %16, implicit $exec
     %18:sreg_64 = COPY %17
-    %19:sreg_64 = SI_IF %18, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %19:sreg_64 = SI_IF %18, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -275,7 +275,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY3]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.1
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.1:
@@ -304,7 +304,7 @@ body:             |
     %23:sreg_32 = S_MOV_B32 1
     %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
     %0:sreg_64 = COPY %24
-    %5:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64 = SI_IF %0, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -342,7 +342,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.2
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.2:
@@ -406,7 +406,7 @@ body:             |
     %23:sreg_32 = S_MOV_B32 1
     %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
     %0:sreg_64 = COPY %24
-    %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64 = SI_IF %0, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -475,7 +475,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.2
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.2:
@@ -537,7 +537,7 @@ body:             |
     %23:sreg_32 = S_MOV_B32 1
     %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
     %0:sreg_64 = COPY %24
-    %5:sreg_64 = SI_IF %0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64 = SI_IF %0, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
@@ -614,7 +614,7 @@ body:             |
   ; GFX9-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0
   ; GFX9-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 1
   ; GFX9-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 [[COPY4]](s32), [[S_MOV_B32_]], implicit $exec
-  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX9-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF [[V_CMP_LT_I32_e64_]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX9-NEXT:   S_BRANCH %bb.3
   ; GFX9-NEXT: {{  $}}
   ; GFX9-NEXT: bb.3:
@@ -689,7 +689,7 @@ body:             |
     %23:sreg_32 = S_MOV_B32 1
     %24:sreg_64 = V_CMP_LT_I32_e64 %16(s32), %23, implicit $exec
     %0:sreg_64 = COPY %24
-    %5:sreg_64 = SI_IF %0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %5:sreg_64 = SI_IF %0, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
index ef6771278b06f35..35be7f02ff3e699 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-lane-mask.mir
@@ -37,7 +37,7 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI3]], $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32 = S_OR_B32 [[PHI2]], $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_LT_I32_e64_]], 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GFX10-NEXT:   S_BRANCH %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -104,7 +104,7 @@ body: |
     %36:vgpr_32 = COPY %26
     %39:sreg_32 = S_OR_B32 %42, $exec_lo, implicit-def $scc
     %45:sreg_32 = S_OR_B32 %48, $exec_lo, implicit-def $scc
-    %4:sreg_32 = SI_IF killed %27, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_32 = SI_IF killed %27, 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
index 037a285794120da..3d20d91a6eb5fe0 100644
--- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
+++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.mir
@@ -24,13 +24,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -77,11 +77,11 @@ body:             |
     %2:vgpr_32 = COPY $vgpr1
 
   bb.1:
-    %3:sreg_32 = SI_IF %1, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_32 = SI_IF %1, 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
-    %4:sreg_32 = SI_IF %1, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_32 = SI_IF %1, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
@@ -135,13 +135,13 @@ body:             |
   ; CHECK-NEXT: bb.1:
   ; CHECK-NEXT:   successors: %bb.6(0x40000000), %bb.2(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
   ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32 = SI_IF [[COPY1]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -188,11 +188,11 @@ body:             |
     %2:sreg_32 = COPY $sgpr10
 
   bb.1:
-    %3:sreg_32 = SI_IF %1, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %3:sreg_32 = SI_IF %1, 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
-    %4:sreg_32 = SI_IF %1, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_32 = SI_IF %1, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index 2e6a73bb2cc00fb..afe701f1f4b6e77 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -14,7 +14,7 @@
     %ptr = load volatile ptr addrspace(1), ptr addrspace(1) %tid.gep
     %xor = xor i32 %tid, 1
     %cmp = icmp ne i32 %xor, 0
-    %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp)
+    %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %cmp, i1 0)
     %2 = extractvalue { i1, i64 } %1, 0
     %3 = extractvalue { i1, i64 } %1, 1
     br i1 %2, label %atomic, label %exit
@@ -29,7 +29,7 @@
     ret void
   }
 
-  declare { i1, i64 } @llvm.amdgcn.if(i1)
+  declare { i1, i64 } @llvm.amdgcn.if(i1, i1)
 
   declare void @llvm.amdgcn.end.cf(i64)
 
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
index 18df16988d8e4d2..f1b314373ace6f9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -29,10 +29,10 @@
   }
 
   ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+  declare { i1, i64 } @llvm.amdgcn.if(i1, i1) #1
 
   ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+  declare { i1, i64 } @llvm.amdgcn.else(i64, i1) #1
 
   ; Function Attrs: convergent nounwind readnone
   declare i64 @llvm.amdgcn.break(i64) #2
diff --git a/llvm/test/CodeGen/AMDGPU/mmra.ll b/llvm/test/CodeGen/AMDGPU/mmra.ll
index 39650f4295c76eb..f5a3799c65b5f5f 100644
--- a/llvm/test/CodeGen/AMDGPU/mmra.ll
+++ b/llvm/test/CodeGen/AMDGPU/mmra.ll
@@ -148,7 +148,7 @@ define void @cmpxchg(ptr %ptr) {
   ; CHECK-NEXT:   [[DEF7:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
   ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[DEF7]]
   ; CHECK-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64 = S_OR_B64 [[PHI]], $exec, implicit-def $scc
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2.partword.cmpxchg.failure:
diff --git a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
index 10d08032bf59a5d..aebb060d7e208b2 100644
--- a/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
+++ b/llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -10,7 +10,7 @@
 ; StructurizeCFG.
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_ret(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot, i1 false)
 ; IR: %1 = extractvalue { i1, i64 } %0, 0
 ; IR: %2 = extractvalue { i1, i64 } %0, 1
 ; IR: br i1 %1, label %LeafBlock1, label %Flow
@@ -18,7 +18,7 @@
 ; IR: Flow:
 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 ; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2, i1 false)
 ; IR: %6 = extractvalue { i1, i64 } %5, 0
 ; IR: %7 = extractvalue { i1, i64 } %5, 1
 ; IR: br i1 %6, label %LeafBlock, label %Flow1
@@ -32,7 +32,7 @@
 ; IR:  Flow2:
 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8, i1 false)
 ; IR: %10 = extractvalue { i1, i64 } %9, 0
 ; IR: %11 = extractvalue { i1, i64 } %9, 1
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
@@ -45,7 +45,7 @@
 ; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
 ; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13, i1 false)
 ; IR: %15 = extractvalue { i1, i64 } %14, 0
 ; IR: %16 = extractvalue { i1, i64 } %14, 1
 ; IR: br i1 %15, label %exit1, label %Flow2
@@ -140,13 +140,13 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot, i1 false)
 
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2, i1 false)
 
 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8, i1 false)
 ; IR: br i1 %10, label %exit0, label %UnifiedUnreachableBlock
 
 
@@ -202,7 +202,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: {{^}}Flow:
 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 ; IR: %4 = phi i1 [ %uniform.cond0, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2, i1 false)
 ; IR: br i1 %6, label %LeafBlock, label %Flow1
 
 ; IR: {{^}}LeafBlock:
@@ -216,7 +216,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: Flow2:
 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8, i1 false)
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
@@ -227,7 +227,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: %12 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %3, %Flow ]
 ; IR: %13 = phi i1 [ %divergent.cond1.inv, %LeafBlock ], [ %4, %Flow ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13, i1 false)
 ; IR: %15 = extractvalue { i1, i64 } %14, 0
 ; IR: %16 = extractvalue { i1, i64 } %14, 1
 ; IR: br i1 %15, label %exit1, label %Flow2
@@ -276,17 +276,17 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot, i1 false)
 ; IR: br i1 %1, label %LeafBlock1, label %Flow
 
 ; IR: Flow:
 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 ; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2, i1 false)
 
 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8, i1 false)
 
 define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(ptr addrspace(1) nocapture %arg0, ptr addrspace(1) nocapture %arg1, ptr addrspace(1) nocapture %arg2, i32 %arg3) #0 {
 entry:
@@ -399,17 +399,17 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 }
 
 ; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
-; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot)
+; IR: %0 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %Pivot, i1 false)
 
 ; IR: Flow:
 ; IR: %3 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
 ; IR: %4 = phi i1 [ %SwitchLeaf2, %LeafBlock1 ], [ false, %entry ]
-; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2)
+; IR: %5 = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 %2, i1 false)
 
 ; IR: Flow2:
 ; IR: %8 = phi i1 [ false, %exit1 ], [ %12, %Flow1 ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %16)
-; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8)
+; IR: %9 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %8, i1 false)
 ; IR: br i1 %10, label %exit0, label %UnifiedReturnBlock
 
 ; IR: exit0:
@@ -420,7 +420,7 @@ exit1:                                     ; preds = %LeafBlock, %LeafBlock1
 ; IR: %12 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %3, %Flow ]
 ; IR: %13 = phi i1 [ %SwitchLeaf.inv, %LeafBlock ], [ %4, %Flow ]
 ; IR: call void @llvm.amdgcn.end.cf.i64(i64 %7)
-; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13)
+; IR: %14 = call { i1, i64 } @llvm.amdgcn.if.i64(i1 %13, i1 false)
 ; IR: %15 = extractvalue { i1, i64 } %14, 0
 ; IR: %16 = extractvalue { i1, i64 } %14, 1
 ; IR: br i1 %15, label %exit1, label %Flow2
diff --git a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
index 6c62f3f225cd9b7..7377f020b977aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/llvm/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -16,7 +16,7 @@ define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
 ; OPT-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], [[FLOW:%.*]] ], [ 0, [[LOOP_OUTER]] ]
 ; OPT-NEXT:    [[TMP45:%.*]] = phi i32 [ [[TMP43]], [[LOOP_OUTER]] ], [ [[TMP3]], [[FLOW]] ]
 ; OPT-NEXT:    [[TMP48:%.*]] = icmp slt i32 [[TMP45]], [[UB:%.*]]
-; OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]])
+; OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP48]], i1 false)
 ; OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; OPT-NEXT:    br i1 [[TMP1]], label [[ENDIF:%.*]], label [[FLOW]]
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
index bd6ef9e088b12fe..24bc8981c4768f3 100644
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -61,7 +61,7 @@ define amdgpu_kernel void @reduced_nested_loop_conditions(ptr addrspace(3) nocap
 ; IR-NEXT:    [[PHI_BROKEN:%.*]] = phi i64 [ [[TMP6:%.*]], %[[BB10:.*]] ], [ 0, %[[BB]] ]
 ; IR-NEXT:    [[MY_TMP6:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP5:%.*]], %[[BB10]] ]
 ; IR-NEXT:    [[MY_TMP7:%.*]] = icmp eq i32 [[MY_TMP6]], 1
-; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]])
+; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP7]], i1 false)
 ; IR-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; IR-NEXT:    br i1 [[TMP1]], label %[[BB8:.*]], label %[[FLOW]]
@@ -205,7 +205,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    br label %[[BB14:.*]]
 ; IR:       [[FLOW3:.*]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP20:%.*]])
-; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]])
+; IR-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP14:%.*]], i1 false)
 ; IR-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; IR-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; IR-NEXT:    br i1 [[TMP1]], label %[[BB4_BB13_CRIT_EDGE:.*]], label %[[FLOW4:.*]]
@@ -219,7 +219,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    br label %[[BB31:.*]]
 ; IR:       [[FLOW]]:
 ; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP3]], %[[FLOW4]] ], [ true, %[[BB]] ]
-; IR-NEXT:    [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]])
+; IR-NEXT:    [[TMP5:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP4]], i1 false)
 ; IR-NEXT:    [[TMP6:%.*]] = extractvalue { i1, i64 } [[TMP5]], 0
 ; IR-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP5]], 1
 ; IR-NEXT:    br i1 [[TMP6]], label %[[BB13]], label %[[BB31]]
@@ -228,7 +228,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    [[MY_TMP1037:%.*]] = phi i32 [ [[MY_TMP1033]], %[[BB14_LR_PH]] ], [ [[TMP12:%.*]], %[[FLOW1]] ]
 ; IR-NEXT:    [[MY_TMP936:%.*]] = phi <4 x i32> [ [[MY_TMP932]], %[[BB14_LR_PH]] ], [ [[TMP11:%.*]], %[[FLOW1]] ]
 ; IR-NEXT:    [[MY_TMP15:%.*]] = icmp eq i32 [[MY_TMP1037]], 1
-; IR-NEXT:    [[TMP8:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP15]])
+; IR-NEXT:    [[TMP8:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[MY_TMP15]], i1 false)
 ; IR-NEXT:    [[TMP9:%.*]] = extractvalue { i1, i64 } [[TMP8]], 0
 ; IR-NEXT:    [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP8]], 1
 ; IR-NEXT:    br i1 [[TMP9]], label %[[BB16:.*]], label %[[FLOW1]]
@@ -268,7 +268,7 @@ define amdgpu_kernel void @nested_loop_conditions(ptr addrspace(1) nocapture %ar
 ; IR-NEXT:    br label %[[FLOW1]]
 ; IR:       [[FLOW2]]:
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP16]])
-; IR-NEXT:    [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
+; IR-NEXT:    [[TMP18:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]], i1 false)
 ; IR-NEXT:    [[TMP19:%.*]] = extractvalue { i1, i64 } [[TMP18]], 0
 ; IR-NEXT:    [[TMP20]] = extractvalue { i1, i64 } [[TMP18]], 1
 ; IR-NEXT:    br i1 [[TMP19]], label %[[BB31_LOOPEXIT:.*]], label %[[FLOW3]]
diff --git a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 748775dc2cf1d57..5f208f0c36446ea 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -102,7 +102,7 @@ body:             |
   ; GCN-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 1048576
   ; GCN-NEXT:   [[REG_SEQUENCE2:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[S_MOV_B32_1]], %subreg.sub0, killed [[S_MOV_B32_]], %subreg.sub1
   ; GCN-NEXT:   [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], implicit $exec
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U64_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U64_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1.bb1:
@@ -143,7 +143,7 @@ body:             |
     %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
     %22 = COPY killed %20
     %21 = V_CMP_LT_U64_e64 killed %17, %22, implicit $exec
-    %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1 = SI_IF killed %21, 0, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1.bb1
 
   bb.1.bb1:
@@ -243,7 +243,7 @@ body:             |
   ; GCN-NEXT:   [[V_CMP_LT_U64_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE1]], [[REG_SEQUENCE3]], implicit $exec
   ; GCN-NEXT:   [[V_CMP_LT_U64_e64_1:%[0-9]+]]:sreg_64 = V_CMP_LT_U64_e64 killed [[REG_SEQUENCE2]], [[REG_SEQUENCE3]], implicit $exec
   ; GCN-NEXT:   [[S_AND_B64_:%[0-9]+]]:sreg_64 = S_AND_B64 killed [[V_CMP_LT_U64_e64_]], killed [[V_CMP_LT_U64_e64_1]], implicit-def dead $scc
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_AND_B64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_AND_B64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1.bb1:
@@ -293,7 +293,7 @@ body:             |
     %27 = V_CMP_LT_U64_e64 killed %18, %28, implicit $exec
     %29 = V_CMP_LT_U64_e64 killed %23, %28, implicit $exec
     %31 = S_AND_B64 killed %27, killed %29, implicit-def dead $scc
-    %1 = SI_IF killed %31, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1 = SI_IF killed %31, 0, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1.bb1
 
   bb.1.bb1:
@@ -375,7 +375,7 @@ body:             |
   ; GCN-NEXT:   [[V_ADDC_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADDC_U32_e32 0, [[COPY3]], implicit-def $vcc, implicit $vcc, implicit $exec
   ; GCN-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e32_]], %subreg.sub0, killed [[V_ADDC_U32_e32_]], %subreg.sub1
   ; GCN-NEXT:   [[V_CMP_LT_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_U32_e64 killed [[REG_SEQUENCE1]].sub0, 12, implicit $exec
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_LT_U32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.1
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1.bb1:
@@ -416,7 +416,7 @@ body:             |
     %20 = REG_SEQUENCE killed %19, %subreg.sub0, killed %18, %subreg.sub1
     %22 = COPY killed %20.sub1
     %21 = V_CMP_LT_U32_e64 killed %17.sub0, %22, implicit $exec
-    %1 = SI_IF killed %21, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1 = SI_IF killed %21, 0, %bb.2.bb2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1.bb1
 
   bb.1.bb1:
diff --git a/llvm/test/CodeGen/AMDGPU/opt-vgpr-live-range-verifier-error.mir b/llvm/test/CodeGen/AMDGPU/opt-vgpr-live-range-verifier-error.mir
index 929556252943895..9a32c4261a41941 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-vgpr-live-range-verifier-error.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-vgpr-live-range-verifier-error.mir
@@ -23,20 +23,20 @@ body:             |
     successors: %bb.2(0x40000000), %bb.4(0x40000000)
 
     %5:sreg_64 = V_CMP_LT_I32_e64 0, %4, implicit $exec
-    %6:sreg_64 = SI_IF killed %5, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %6:sreg_64 = SI_IF killed %5, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.2:
     successors: %bb.4(0x40000000), %bb.3(0x40000000)
 
-    dead %7:sreg_64 = SI_ELSE killed %6, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    dead %7:sreg_64 = SI_ELSE killed %6, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.3:
     successors: %bb.4(0x80000000)
 
     %8:sreg_64 = V_CMP_EQ_U32_e64 0, killed %4, implicit $exec
-    dead %9:sreg_64 = SI_IF killed %8, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    dead %9:sreg_64 = SI_IF killed %8, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.4:
diff --git a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
index 829d01d8e1c3657..3d5a264a3dd7708 100644
--- a/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
+++ b/llvm/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
@@ -4,7 +4,7 @@
 # CHECK-LABEL:  phi-cf-test
 # CHECK: bb.0:
 # CHECK:     [[COND:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64
-# CHECK:     [[IF_SOURCE0:%[0-9]+]]:sreg_64 = SI_IF [[COND]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+# CHECK:     [[IF_SOURCE0:%[0-9]+]]:sreg_64 = SI_IF [[COND]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 # CHECK:     [[IF_INPUT_REG:%[0-9]+]]:sreg_64 = S_MOV_B64_term killed [[IF_SOURCE0]], implicit $exec
 
 # CHECK: bb.1:
@@ -12,7 +12,7 @@
 # CHECK:     SI_END_CF killed [[END_CF_ARG]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 
 # CHECK: bb.2:
-# CHECK:     [[IF_SOURCE1:%[0-9]+]]:sreg_64 = SI_IF [[COND]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+# CHECK:     [[IF_SOURCE1:%[0-9]+]]:sreg_64 = SI_IF [[COND]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
 # CHECK:     [[IF_INPUT_REG]]:sreg_64 = S_MOV_B64_term killed [[IF_SOURCE1]], implicit $exec
 
 
@@ -29,7 +29,7 @@ body:             |
     %5:vgpr_32(s32) = COPY $vgpr0
     %0:sreg_64 = V_CMP_EQ_U32_e64 0, %5(s32), implicit $exec
     %18:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    %22:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %22:sreg_64 = SI_IF %0, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
   bb.2:
@@ -49,7 +49,7 @@ body:             |
     %17:sgpr_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3
     BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, implicit $exec :: (volatile store (s32) into `ptr addrspace(1) undef`, addrspace 1)
     %19:vgpr_32 = COPY %4
-    %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %20:sreg_64 = SI_IF %0, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
index 15f6bb632f3113e..c5cedeb0d374b44 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll
@@ -7,13 +7,13 @@ define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 {
 ; OPT-SAME: i32 [[TMP0:%.*]]) !dbg [[DBG5:![0-9]+]] {
 ; OPT-NEXT:    [[C:%.*]] = icmp ne i32 [[TMP0]], 0, !dbg [[DBG13:![0-9]+]]
 ; OPT-NEXT:      #dbg_value(i1 [[C]], [[META9:![0-9]+]], !DIExpression(), [[DBG13]])
-; OPT-NEXT:    [[TMP2:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]]), !dbg [[DBG14:![0-9]+]]
+; OPT-NEXT:    [[TMP2:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]], i1 false), !dbg [[DBG14:![0-9]+]]
 ; OPT-NEXT:    [[TMP3:%.*]] = extractvalue { i1, i64 } [[TMP2]], 0, !dbg [[DBG14]]
 ; OPT-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP2]], 1, !dbg [[DBG14]]
 ; OPT-NEXT:    br i1 [[TMP3]], label [[FALSE:%.*]], label [[FLOW:%.*]], !dbg [[DBG14]]
 ; OPT:       Flow:
 ; OPT-NEXT:    [[TMP5:%.*]] = phi i32 [ 33, [[FALSE]] ], [ undef, [[TMP1:%.*]] ]
-; OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP4]]), !dbg [[DBG14]]
+; OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP4]], i1 false), !dbg [[DBG14]]
 ; OPT-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0, !dbg [[DBG14]]
 ; OPT-NEXT:    [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1, !dbg [[DBG14]]
 ; OPT-NEXT:    br i1 [[TMP7]], label [[TRUE:%.*]], label [[EXIT:%.*]], !dbg [[DBG14]]
@@ -54,7 +54,7 @@ define amdgpu_ps void @loop_if_break(i32 %n) !dbg !19 {
 ; OPT-NEXT:      #dbg_value(i32 [[I]], [[META21:![0-9]+]], !DIExpression(), [[DBG25]])
 ; OPT-NEXT:    [[C:%.*]] = icmp ugt i32 [[I]], 0, !dbg [[DBG26:![0-9]+]]
 ; OPT-NEXT:      #dbg_value(i1 [[C]], [[META22:![0-9]+]], !DIExpression(), [[DBG26]])
-; OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]]), !dbg [[DBG27:![0-9]+]]
+; OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[C]], i1 false), !dbg [[DBG27:![0-9]+]]
 ; OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0, !dbg [[DBG27]]
 ; OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1, !dbg [[DBG27]]
 ; OPT-NEXT:    br i1 [[TMP1]], label [[LOOP_BODY:%.*]], label [[FLOW]], !dbg [[DBG27]]
diff --git a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
index f1b88c761629850..1b7d5d320c8f33b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -27,7 +27,7 @@ body: |
     %0 = PHI %8, %bb.0, %0, %bb.1, %2, %bb.2
     %9 = V_MOV_B32_e32 9, implicit $exec
     %10 = V_CMP_EQ_U32_e64 %7, %9, implicit $exec
-    %1 = SI_IF %10, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    %1 = SI_IF %10, 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
index eddad05d976bd3e..bcdd7ce406c2cc8 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-control-flow.mir
@@ -43,7 +43,7 @@ body:             |
   bb.0:
     successors: %bb.1, %bb.2
 
-    %1:sreg_64 = SI_IF undef %0:sreg_64, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %1:sreg_64 = SI_IF undef %0:sreg_64, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
@@ -89,7 +89,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
     S_BRANCH %bb.2
 
@@ -147,7 +147,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
     S_BRANCH %bb.2
 
@@ -210,7 +210,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
     S_BRANCH %bb.2
 
@@ -271,7 +271,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
     S_BRANCH %bb.2
 
@@ -334,7 +334,7 @@ body:             |
     %0:vgpr_32 = COPY killed $vgpr0
     %1:vgpr_32 = COPY killed $vgpr1
     %3:sreg_64_xexec = V_CMP_EQ_U32_e64 killed %0, killed %1, implicit $exec
-    %4:sreg_64_xexec = SI_IF %3, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %4:sreg_64_xexec = SI_IF %3, 0, %bb.1, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %5:sreg_64_xexec = S_MOV_B64_term %4, implicit $exec
     S_BRANCH %bb.2
 
@@ -403,7 +403,7 @@ body:             |
 
     %6:sreg_64_xexec = COPY %3
     SI_END_CF killed %6, implicit-def $exec, implicit-def dead $scc, implicit $exec
-    %7:sreg_64_xexec = SI_IF %4, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec
+    %7:sreg_64_xexec = SI_IF %4, 0, %bb.2, implicit-def $exec, implicit-def dead $scc, implicit $exec
     %8:sreg_64_xexec = S_MOV_B64_term %7, implicit $exec
 
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir
index ecbd47a9e8d0ddb..2ab9c9adfa1719b 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies-order-of-phi-incomings.mir
@@ -49,7 +49,7 @@ body: |
   ; GCN-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32 = S_ANDN2_B32 [[PHI]], $exec_lo, implicit-def $scc
   ; GCN-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[PHI1]], $exec_lo, implicit-def $scc
   ; GCN-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32 = S_OR_B32 [[S_ANDN2_B32_]], [[S_AND_B32_]], implicit-def $scc
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY7]], %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[COPY7]], 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.2
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
@@ -114,7 +114,7 @@ body: |
     %17:vreg_64 = PHI %13, %bb.0, %18, %bb.3
     %19:vreg_1 = PHI %14, %bb.0, %20, %bb.3
     %21:sreg_32 = COPY %19
-    %22:sreg_32 = SI_IF %21, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %22:sreg_32 = SI_IF %21, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
 
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
index 502116b121d949c..f0a3af96e551580 100644
--- a/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-lower-i1-copies.mir
@@ -10,7 +10,7 @@ body:             |
     %0:sreg_64 = S_MOV_B64 0
     %8:vreg_1 = IMPLICIT_DEF
     %10:sreg_64 = IMPLICIT_DEF
-    %11:sreg_64 = SI_IF %10, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %11:sreg_64 = SI_IF %10, 0, %bb.3, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
   bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir
index 93796b3049b5b89..f3f7fe05a40d540 100644
--- a/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-opt-vgpr-liverange-bug-deadlanes.mir
@@ -36,7 +36,7 @@ body:             |
   ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
   ; CHECK-NEXT:   [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN killed [[COPY]], undef %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
   ; CHECK-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 0, [[BUFFER_LOAD_DWORD_OFFEN]], implicit $exec
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[V_CMP_NE_U32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.1
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.1:
@@ -49,7 +49,7 @@ body:             |
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vreg_128 = PHI undef %10:vreg_128, %bb.0, [[REG_SEQUENCE]], %bb.1
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[BUFFER_LOAD_DWORD_OFFEN]], %bb.0, undef %15:vgpr_32, %bb.1
-  ; CHECK-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3:
@@ -70,7 +70,7 @@ body:             |
     %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %1, undef %6:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load (s32))
     %4:sreg_64 = V_CMP_NE_U32_e64 0, %3, implicit $exec
-    %8:sreg_64 = SI_IF killed %4, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %8:sreg_64 = SI_IF killed %4, 0, %bb.6, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.5
 
   bb.5:
@@ -82,7 +82,7 @@ body:             |
     successors: %bb.7(0x40000000), %bb.8(0x40000000)
 
     %10:vreg_128 = PHI undef %156:vreg_128, %bb.0, %9, %bb.5
-    %11:sreg_64 = SI_ELSE killed %8, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %11:sreg_64 = SI_ELSE killed %8, 0, %bb.8, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.7
 
   bb.7:
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir
index 7c7b930b6c31812..553cf03b26090ce 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.mir
@@ -39,14 +39,14 @@ body:             |
   ; GCN-NEXT:   [[S_XOR_B64_:%[0-9]+]]:sreg_64 = S_XOR_B64 killed [[V_CMP_EQ_U32_e64_]], -1, implicit-def dead $scc
   ; GCN-NEXT:   [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
   ; GCN-NEXT:   [[GLOBAL_LOAD_DWORDX2_:%[0-9]+]]:vreg_64 = GLOBAL_LOAD_DWORDX2 killed [[V_MOV_B]], 0, 0, implicit $exec
-  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_XOR_B64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64 = SI_IF killed [[S_XOR_B64_]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.3
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.1:
   ; GCN-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT:   [[PHI:%[0-9]+]]:vreg_64 = PHI [[GLOBAL_LOAD_DWORDX2_]], %bb.0, undef %13:vreg_64, %bb.3
-  ; GCN-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; GCN-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_64 = SI_ELSE killed [[SI_IF]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; GCN-NEXT:   S_BRANCH %bb.2
   ; GCN-NEXT: {{  $}}
   ; GCN-NEXT: bb.2:
@@ -80,13 +80,13 @@ body:             |
     %3:sreg_64 = S_XOR_B64 killed %2, -1, implicit-def dead $scc
     %4:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec
     %5:vreg_64 = GLOBAL_LOAD_DWORDX2 killed %4, 0, 0, implicit $exec
-    %6:sreg_64 = SI_IF killed %3, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %6:sreg_64 = SI_IF killed %3, 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
   
   bb.1:
     successors: %bb.2(0x40000000), %bb.4(0x40000000)
   
-    %7:sreg_64 = SI_ELSE %6, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %7:sreg_64 = SI_ELSE %6, 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.2
   
   bb.2:
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
index 2dfb72a08cffc52..22193fe4cf35d26 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll
@@ -33,8 +33,8 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
 ; CHECK-NEXT:  .LBB0_5: ; %Flow2
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; CHECK-NEXT:    s_and_b64 vcc, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_vccz .LBB0_8
-; CHECK-NEXT:    s_branch .LBB0_7
+; CHECK-NEXT:    s_cbranch_vccnz .LBB0_7
+; CHECK-NEXT:    s_branch .LBB0_8
 ; CHECK-NEXT:  .LBB0_6:
 ; CHECK-NEXT:    s_mov_b64 s[2:3], 0
 ; CHECK-NEXT:    s_mov_b64 s[0:1], 0
@@ -64,8 +64,8 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {
 ; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], exec
 ; CHECK-NEXT:    s_trap 2
 ; CHECK-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
-; CHECK-NEXT:    s_cbranch_execnz .LBB0_9
-; CHECK-NEXT:    s_branch .LBB0_10
+; CHECK-NEXT:    s_cbranch_execz .LBB0_10
+; CHECK-NEXT:    s_branch .LBB0_9
 ; CHECK-NEXT:  .LBB0_14: ; %cond.false.i8
 ; CHECK-NEXT:    s_mov_b64 s[2:3], -1
 ; CHECK-NEXT:    s_trap 2
diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
index f232275c998d236..ac54a62060f1f87 100644
--- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll
@@ -29,7 +29,7 @@ define void @my_func(i32 %0) {
 ; IR:       Flow11:
 ; IR-NEXT:    [[TMP4:%.*]] = phi i1 [ [[TMP9:%.*]], [[FLOW12]] ], [ false, [[FLOW]] ]
 ; IR-NEXT:    [[TMP5:%.*]] = phi i1 [ [[TMP10:%.*]], [[FLOW12]] ], [ [[TMP2]], [[FLOW]] ]
-; IR-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]])
+; IR-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]], i1 false)
 ; IR-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0
 ; IR-NEXT:    [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1
 ; IR-NEXT:    br i1 [[TMP7]], label [[DO_BODY:%.*]], label [[FLOW17:%.*]]
@@ -41,7 +41,7 @@ define void @my_func(i32 %0) {
 ; IR-NEXT:    br label [[FLOW11]]
 ; IR:       NodeBlock7:
 ; IR-NEXT:    [[PIVOT8:%.*]] = icmp sge i32 [[TMP0:%.*]], 2
-; IR-NEXT:    [[TMP11:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PIVOT8]])
+; IR-NEXT:    [[TMP11:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[PIVOT8]], i1 false)
 ; IR-NEXT:    [[TMP12:%.*]] = extractvalue { i1, i64 } [[TMP11]], 0
 ; IR-NEXT:    [[TMP13:%.*]] = extractvalue { i1, i64 } [[TMP11]], 1
 ; IR-NEXT:    br i1 [[TMP12]], label [[LEAFBLOCK5:%.*]], label [[FLOW13:%.*]]
@@ -51,7 +51,7 @@ define void @my_func(i32 %0) {
 ; IR:       Flow13:
 ; IR-NEXT:    [[TMP14:%.*]] = phi i1 [ true, [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ]
 ; IR-NEXT:    [[TMP15:%.*]] = phi i1 [ [[SWITCHLEAF6]], [[LEAFBLOCK5]] ], [ false, [[NODEBLOCK7]] ]
-; IR-NEXT:    [[TMP16:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP13]])
+; IR-NEXT:    [[TMP16:%.*]] = call { i1, i64 } @llvm.amdgcn.else.i64.i64(i64 [[TMP13]], i1 false)
 ; IR-NEXT:    [[TMP17:%.*]] = extractvalue { i1, i64 } [[TMP16]], 0
 ; IR-NEXT:    [[TMP18:%.*]] = extractvalue { i1, i64 } [[TMP16]], 1
 ; IR-NEXT:    br i1 [[TMP17]], label [[LEAFBLOCK3:%.*]], label [[FLOW14:%.*]]
@@ -63,7 +63,7 @@ define void @my_func(i32 %0) {
 ; IR-NEXT:    [[TMP19:%.*]] = phi i1 [ [[SWITCHLEAF4_INV]], [[LEAFBLOCK3]] ], [ [[TMP14]], [[FLOW13]] ]
 ; IR-NEXT:    [[TMP20:%.*]] = phi i1 [ [[SWITCHLEAF4]], [[LEAFBLOCK3]] ], [ [[TMP15]], [[FLOW13]] ]
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP18]])
-; IR-NEXT:    [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]])
+; IR-NEXT:    [[TMP21:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP20]], i1 false)
 ; IR-NEXT:    [[TMP22:%.*]] = extractvalue { i1, i64 } [[TMP21]], 0
 ; IR-NEXT:    [[TMP23:%.*]] = extractvalue { i1, i64 } [[TMP21]], 1
 ; IR-NEXT:    br i1 [[TMP22]], label [[LAND_LHS_TRUE_I:%.*]], label [[FLOW15]]
@@ -76,7 +76,7 @@ define void @my_func(i32 %0) {
 ; IR-NEXT:    br label [[FLOW12]]
 ; IR:       LeafBlock9:
 ; IR-NEXT:    [[SWITCHLEAF10:%.*]] = icmp sgt i32 [[TMP0]], 1
-; IR-NEXT:    [[TMP26:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[SWITCHLEAF10]])
+; IR-NEXT:    [[TMP26:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[SWITCHLEAF10]], i1 false)
 ; IR-NEXT:    [[TMP27:%.*]] = extractvalue { i1, i64 } [[TMP26]], 0
 ; IR-NEXT:    [[TMP28:%.*]] = extractvalue { i1, i64 } [[TMP26]], 1
 ; IR-NEXT:    br i1 [[TMP27]], label [[DO_BODY_I_I_I_I:%.*]], label [[FLOW16]]
@@ -94,7 +94,7 @@ define void @my_func(i32 %0) {
 ; IR:       Flow17:
 ; IR-NEXT:    [[TMP31:%.*]] = phi i1 [ true, [[DO_BODY]] ], [ [[TMP4]], [[FLOW11]] ]
 ; IR-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
-; IR-NEXT:    [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]])
+; IR-NEXT:    [[TMP32:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP31]], i1 false)
 ; IR-NEXT:    [[TMP33:%.*]] = extractvalue { i1, i64 } [[TMP32]], 0
 ; IR-NEXT:    [[TMP34:%.*]] = extractvalue { i1, i64 } [[TMP32]], 1
 ; IR-NEXT:    br i1 [[TMP33]], label [[UNIFIEDUNREACHABLEBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir
index 77032ffcf18a915..8c009dd3b15d1b4 100644
--- a/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir
+++ b/llvm/test/CodeGen/AMDGPU/stale-livevar-in-twoaddr-pass.mir
@@ -31,7 +31,7 @@ body:             |
     %5:vgpr_32 = V_MAC_F32_e32 0, %1, %0, implicit $mode, implicit $exec
     %7:vgpr_32 = V_CVT_I32_F32_e32 %5, implicit $mode, implicit $exec
     %8:sreg_64 = V_CMP_NE_U32_e64 1, %7, implicit $exec
-    %6:sreg_64 = SI_IF %8, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+    %6:sreg_64 = SI_IF %8, 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.1
 
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
index be1a8aceb8c9032..3b3f873ab8e611c 100644
--- a/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
+++ b/llvm/test/CodeGen/AMDGPU/stop-tail-duplicate-cfg-intrinsic.mir
@@ -21,7 +21,7 @@ body: |
   ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[S_MOV_B32_]], %bb.0, %6, %bb.3
-  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; CHECK-NEXT:   [[SI_IF:%[0-9]+]]:sreg_64_xexec = SI_IF [[V_CMP_EQ_U32_e64_]], 0, %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK-NEXT:   S_BRANCH %bb.3
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.2:
@@ -53,7 +53,7 @@ body: |
 
   bb.2:
     %6:vgpr_32 = PHI %4:sreg_32, %bb.1, %11:vgpr_32, %bb.4
-    %8:sreg_64_xexec = SI_IF %5:sreg_64_xexec, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    %8:sreg_64_xexec = SI_IF %5:sreg_64_xexec, 0, %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
     S_BRANCH %bb.4
 
   bb.3:
diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
index d9df80ce6c1c04c..c64e4fdff86fc33 100644
--- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
+++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll
@@ -148,7 +148,7 @@ define hidden void @widget() {
 ; SI-OPT:       bb9:
 ; SI-OPT-NEXT:    [[TMP10:%.*]] = call float @wibble()
 ; SI-OPT-NEXT:    [[TMP11:%.*]] = fcmp nsz ogt float [[TMP10]], 0.000000e+00
-; SI-OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP11]])
+; SI-OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP11]], i1 false)
 ; SI-OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; SI-OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; SI-OPT-NEXT:    br i1 [[TMP1]], label [[BB6:%.*]], label [[BB9_BB12_CRIT_EDGE:%.*]]
@@ -206,7 +206,7 @@ define hidden void @blam() {
 ; SI-OPT-NEXT:    br label [[BB4:%.*]]
 ; SI-OPT:       bb4:
 ; SI-OPT-NEXT:    [[TMP5:%.*]] = icmp slt i32 [[TMP3]], 3
-; SI-OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]])
+; SI-OPT-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP5]], i1 false)
 ; SI-OPT-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; SI-OPT-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; SI-OPT-NEXT:    br i1 [[TMP1]], label [[BB8:%.*]], label [[BB6:%.*]]
@@ -216,7 +216,7 @@ define hidden void @blam() {
 ; SI-OPT-NEXT:    br i1 [[TMP7]], label [[BB11:%.*]], label [[BB1:%.*]]
 ; SI-OPT:       bb8:
 ; SI-OPT-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP3]], 1
-; SI-OPT-NEXT:    [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP9]])
+; SI-OPT-NEXT:    [[TMP3:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP9]], i1 false)
 ; SI-OPT-NEXT:    [[TMP4:%.*]] = extractvalue { i1, i64 } [[TMP3]], 0
 ; SI-OPT-NEXT:    [[TMP5:%.*]] = extractvalue { i1, i64 } [[TMP3]], 1
 ; SI-OPT-NEXT:    br i1 [[TMP4]], label [[BB10:%.*]], label [[BB8_BB1_CRIT_EDGE:%.*]]
@@ -229,14 +229,14 @@ define hidden void @blam() {
 ; SI-OPT:       bb11:
 ; SI-OPT-NEXT:    [[TMP12:%.*]] = call float @spam()
 ; SI-OPT-NEXT:    [[TMP13:%.*]] = fcmp nsz oeq float [[TMP12]], 0.000000e+00
-; SI-OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP13]])
+; SI-OPT-NEXT:    [[TMP6:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP13]], i1 false)
 ; SI-OPT-NEXT:    [[TMP7:%.*]] = extractvalue { i1, i64 } [[TMP6]], 0
 ; SI-OPT-NEXT:    [[TMP8:%.*]] = extractvalue { i1, i64 } [[TMP6]], 1
 ; SI-OPT-NEXT:    br i1 [[TMP7]], label [[BB2]], label [[BB14:%.*]]
 ; SI-OPT:       bb14:
 ; SI-OPT-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP8]])
 ; SI-OPT-NEXT:    [[TMP15:%.*]] = fcmp nsz oeq float [[TMP]], 0.000000e+00
-; SI-OPT-NEXT:    [[TMP9:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]])
+; SI-OPT-NEXT:    [[TMP9:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[TMP15]], i1 false)
 ; SI-OPT-NEXT:    [[TMP10:%.*]] = extractvalue { i1, i64 } [[TMP9]], 0
 ; SI-OPT-NEXT:    [[TMP11:%.*]] = extractvalue { i1, i64 } [[TMP9]], 1
 ; SI-OPT-NEXT:    br i1 [[TMP10]], label [[BB17:%.*]], label [[BB16:%.*]]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index 3a49c9b23f59eff..1b3aba2e2ba9ace 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -11,7 +11,7 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 {
   ; SI-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
   ; SI-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.3
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.1.Flow:
@@ -19,7 +19,7 @@ define amdgpu_ps float @else1(i32 %z, float %v) #0 {
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %13:vgpr_32, %bb.0, %4, %bb.3
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, undef %15:vgpr_32, %bb.3
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if:
@@ -67,7 +67,7 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 {
   ; SI-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
   ; SI-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY1]], implicit $exec
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.3
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.1.Flow:
@@ -75,7 +75,7 @@ define amdgpu_ps float @else2(i32 %z, float %v) #0 {
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %16:vgpr_32, %bb.0, %5, %bb.3
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %16:vgpr_32, %bb.0, [[COPY]], %bb.3
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], 0, %bb.4, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if:
@@ -135,7 +135,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[S_MOV_B32_]], %bb.0, %14, %bb.5
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY]], %bb.0, %13, %bb.5
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_I32_e64_]], %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF [[V_CMP_GT_I32_e64_]], 0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.4
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.Flow:
@@ -144,7 +144,7 @@ define amdgpu_ps float @else3(i32 %z, float %v, i32 inreg %bound, i32 %x0) #0 {
   ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI undef %31:vgpr_32, %bb.1, %10, %bb.4
   ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI undef %32:vgpr_32, %bb.1, %9, %bb.4
   ; SI-NEXT:   [[PHI4:%[0-9]+]]:vgpr_32 = PHI [[PHI1]], %bb.1, undef %34:vgpr_32, %bb.4
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], 0, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.3
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.3.if:
@@ -227,7 +227,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
   ; SI-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.6
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.1.Flow:
@@ -237,7 +237,7 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], 0, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if:
@@ -350,7 +350,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed $vgpr1
   ; SI-NEXT:   [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed $vgpr0
   ; SI-NEXT:   [[V_CMP_GT_I32_e64_:%[0-9]+]]:sreg_32 = V_CMP_GT_I32_e64 6, killed [[COPY5]], implicit $exec
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_GT_I32_e64_]], 0, %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.6
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.1.Flow:
@@ -359,7 +359,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
   ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9
   ; SI-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9
   ; SI-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], 0, %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.2.if:
@@ -467,7 +467,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY killed $sgpr4_sgpr5
   ; SI-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY killed $vgpr0
   ; SI-NEXT:   [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_32 = V_CMP_NE_U32_e64 0, [[COPY1]](s32), implicit $exec
-  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32 = SI_IF killed [[V_CMP_NE_U32_e64_]], -1, %bb.5, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.2
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.1.if.then:
@@ -503,7 +503,7 @@ define amdgpu_kernel void @livevariables_update_missed_block(ptr addrspace(1) %s
   ; SI-NEXT:   successors: %bb.1(0x40000000), %bb.7(0x40000000)
   ; SI-NEXT: {{  $}}
   ; SI-NEXT:   [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %56:vgpr_32, %bb.6
-  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+  ; SI-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], -1, %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
   ; SI-NEXT:   S_BRANCH %bb.1
   ; SI-NEXT: {{  $}}
   ; SI-NEXT: bb.6.sw.bb18:
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index b5e4bcd049c42ae..215615744d7eb95 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -60,10 +60,9 @@ define amdgpu_kernel void @v4i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dword v2, v3, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB1_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dword v2, v3, s[2:3]
-; GFX906-NEXT:  .LBB1_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dword v1, v2, s[6:7]
@@ -136,10 +135,9 @@ define amdgpu_kernel void @v8i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v4, s[2:3]
-; GFX906-NEXT:  .LBB3_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx2 v3, v[1:2], s[6:7]
@@ -172,10 +170,9 @@ define amdgpu_kernel void @v16i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB4_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v6, s[2:3]
-; GFX906-NEXT:  .LBB4_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    global_store_dwordx4 v5, v[1:4], s[6:7]
@@ -209,11 +206,10 @@ define amdgpu_kernel void @v32i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[0:1] offset:16
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[0:1]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[0:1], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB5_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx4 v[1:4], v10, s[2:3] offset:16
 ; GFX906-NEXT:    global_load_dwordx4 v[5:8], v10, s[2:3]
-; GFX906-NEXT:  .LBB5_2: ; %bb.2
+; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX906-NEXT:    s_waitcnt vmcnt(1)
 ; GFX906-NEXT:    global_store_dwordx4 v9, v[1:4], s[6:7] offset:16
@@ -412,14 +408,13 @@ define amdgpu_kernel void @v8i8_phi_chain(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[8:9]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v3, s[10:11]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX906-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; GFX906-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
-; GFX906-NEXT:  .LBB8_2: ; %Flow
+; GFX906-NEXT:  ; %bb.2: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX906-NEXT:    s_cbranch_execz .LBB8_4
@@ -646,11 +641,10 @@ define amdgpu_kernel void @v8i8_multi_block(ptr addrspace(1) %src1, ptr addrspac
 ; GFX906-NEXT:    global_load_dwordx2 v[1:2], v6, s[10:11]
 ; GFX906-NEXT:    v_cmp_gt_u32_e32 vcc, 7, v0
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX906-NEXT:  ; %bb.2: ; %bb.2
 ; GFX906-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX906-NEXT:    global_store_dwordx2 v0, v[3:4], s[12:13]
-; GFX906-NEXT:  .LBB11_3: ; %Flow
+; GFX906-NEXT:  ; %bb.3: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:  .LBB11_4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[0:1]
@@ -738,7 +732,6 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:    s_waitcnt vmcnt(0)
 ; GFX906-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
 ; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX906-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX906-NEXT:  ; %bb.1: ; %bb.1
 ; GFX906-NEXT:    s_movk_i32 s6, 0xff00
 ; GFX906-NEXT:    v_mov_b32_e32 v5, 8
@@ -761,7 +754,7 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:    global_store_dword v3, v7, s[12:13] offset:8
 ; GFX906-NEXT:    global_store_dword v3, v6, s[12:13] offset:16
 ; GFX906-NEXT:    global_store_dword v3, v4, s[12:13] offset:24
-; GFX906-NEXT:  .LBB13_2: ; %Flow
+; GFX906-NEXT:  ; %bb.2: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX906-NEXT:    s_cbranch_execz .LBB13_4
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 883657547519ba6..5f7d8b680c13d63 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -80,10 +80,10 @@
   declare align 4 ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr() #1
 
   ; Function Attrs: convergent nocallback nofree nounwind willreturn
-  declare { i1, i64 } @llvm.amdgcn.if.i64(i1) #2
+  declare { i1, i64 } @llvm.amdgcn.if.i64(i1, i1) #2
 
   ; Function Attrs: convergent nocallback nofree nounwind willreturn
-  declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64) #2
+  declare { i1, i64 } @llvm.amdgcn.else.i64.i64(i64, i1) #2
 
   ; Function Attrs: convergent nocallback nofree nounwind willreturn memory(none)
   declare i64 @llvm.amdgcn.if.break.i64(i1, i64) #3
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
index b54ae64032d42ea..ac1113060abf089 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-metadata.mir
@@ -39,10 +39,10 @@
   declare void @llvm.memcpy.inline.p1.p1.i64(ptr addrspace(1) noalias nocapture writeonly, ptr addrspace(1) noalias nocapture readonly, i64 immarg, i1 immarg) #1
 
   ; Function Attrs: convergent nounwind willreturn
-  declare { i1, i32 } @llvm.amdgcn.if.i32(i1) #2
+  declare { i1, i32 } @llvm.amdgcn.if.i32(i1, i1) #2
 
   ; Function Attrs: convergent nounwind willreturn
-  declare { i1, i32 } @llvm.amdgcn.else.i32.i32(i32) #2
+  declare { i1, i32 } @llvm.amdgcn.else.i32.i32(i32, i1) #2
 
   ; Function Attrs: convergent nounwind readnone willreturn
   declare i32 @llvm.amdgcn.if.break.i32(i1, i32) #3
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
index c28a4405d488ccc..cc03d1d907b51c8 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
+++ b/llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir
@@ -15,10 +15,10 @@
   }
 
   ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.if(i1) #1
+  declare { i1, i64 } @llvm.amdgcn.if(i1, i1) #1
 
   ; Function Attrs: convergent nounwind
-  declare { i1, i64 } @llvm.amdgcn.else(i64) #1
+  declare { i1, i64 } @llvm.amdgcn.else(i64, i1) #1
 
   ; Function Attrs: convergent nounwind readnone
   declare i64 @llvm.amdgcn.break(i64) #2

>From d2c31929cfbef35204bfe09da04b55954e640140 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Tue, 21 Jan 2025 08:39:42 -0500
Subject: [PATCH 2/5] Fix formatting

---
 llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 9fb756232e83bc9..958950bb3c19ae7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7274,10 +7274,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
       B.setInsertPt(B.getMBB(), BrCond->getIterator());
       if (IntrID == Intrinsic::amdgcn_if) {
         B.buildInstr(AMDGPU::SI_IF)
-          .addDef(Def)
-          .addUse(Use)
-          .addImm(LikelyVarying)
-          .addMBB(UncondBrTarget);
+            .addDef(Def)
+            .addUse(Use)
+            .addImm(LikelyVarying)
+            .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
             .addDef(Def)

>From 46a5fa4244b313450838548d3c7d37fb48769d03 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Wed, 22 Jan 2025 03:38:04 -0500
Subject: [PATCH 3/5] Use "divergent" instead of "varying"

---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  6 +--
 .../Target/AMDGPU/SIAnnotateControlFlow.cpp   | 45 ++++++++++---------
 llvm/lib/Target/AMDGPU/SIInstructions.td      | 10 ++---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp |  8 ++--
 ... => annotate-likely-divergent-branches.ll} | 26 +++++------
 .../conditional-mem-no-cbranch-execz.ll       |  2 +-
 6 files changed, 49 insertions(+), 48 deletions(-)
 rename llvm/test/CodeGen/AMDGPU/{annotate-likely-varying-branches.ll => annotate-likely-divergent-branches.ll} (96%)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 958950bb3c19ae7..ad36834201fc544 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -7264,7 +7264,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
 
       Register Def = MI.getOperand(1).getReg();
       Register Use = MI.getOperand(3).getReg();
-      auto LikelyVarying = MI.getOperand(4).getImm();
+      auto LikelyDivergent = MI.getOperand(4).getImm();
 
       MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
 
@@ -7276,13 +7276,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
         B.buildInstr(AMDGPU::SI_IF)
             .addDef(Def)
             .addUse(Use)
-            .addImm(LikelyVarying)
+            .addImm(LikelyDivergent)
             .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
             .addDef(Def)
             .addUse(Use)
-            .addImm(LikelyVarying)
+            .addImm(LikelyDivergent)
             .addMBB(UncondBrTarget);
       }
 
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 854eb7706416aeb..8a15d15a7e7567a 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -37,22 +37,22 @@ namespace {
 using StackEntry = std::pair<BasicBlock *, Value *>;
 using StackVector = SmallVector<StackEntry, 16>;
 
-class LikelyVaryingHeuristic {
+class DynamicDivergenceHeuristic {
 public:
-  LikelyVaryingHeuristic(const Function &F, const GCNSubtarget &ST) {
+  DynamicDivergenceHeuristic(const Function &F, const GCNSubtarget &ST) {
     IsSingleLaneExecution = ST.isSingleLaneExecution(F);
   }
 
-  /// Check if \p V is likely to be have dynamically varying values among the
+  /// Check if \p V is likely to have dynamically diverging values among the
   /// workitems in each wavefront.
-  bool isLikelyVarying(const Value *V);
+  bool isLikelyDivergent(const Value *V);
 
 private:
   bool IsSingleLaneExecution = false;
 
-  bool isRelevantSourceOfDivergence(const Value *V) const;
+  bool isWorkitemID(const Value *V) const;
 
-  ValueMap<const Value *, bool> LikelyVaryingCache;
+  ValueMap<const Value *, bool> LikelyDivergentCache;
 };
 
 class SIAnnotateControlFlow {
@@ -81,7 +81,7 @@ class SIAnnotateControlFlow {
 
   LoopInfo *LI;
 
-  LikelyVaryingHeuristic LVHeuristic;
+  DynamicDivergenceHeuristic DivergenceHeuristic;
 
   void initialize(const GCNSubtarget &ST);
 
@@ -120,7 +120,7 @@ class SIAnnotateControlFlow {
 public:
   SIAnnotateControlFlow(Function &F, const GCNSubtarget &ST, DominatorTree &DT,
                         LoopInfo &LI, UniformityInfo &UA)
-      : F(&F), UA(&UA), DT(&DT), LI(&LI), LVHeuristic(F, ST) {
+      : F(&F), UA(&UA), DT(&DT), LI(&LI), DivergenceHeuristic(F, ST) {
     initialize(ST);
   }
 
@@ -209,12 +209,13 @@ bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
 
   // Check if it's likely that at least one lane will always follow the
   // then-branch, i.e., the then-branch is never skipped completly.
-  Value *IsLikelyVarying =
-      LVHeuristic.isLikelyVarying(Term->getCondition()) ? BoolTrue : BoolFalse;
+  Value *IsLikelyDivergent =
+      DivergenceHeuristic.isLikelyDivergent(Term->getCondition()) ? BoolTrue
+                                                                  : BoolFalse;
 
   IRBuilder<> IRB(Term);
   Value *IfCall = IRB.CreateCall(getDecl(If, Intrinsic::amdgcn_if, IntMask),
-                                 {Term->getCondition(), IsLikelyVarying});
+                                 {Term->getCondition(), IsLikelyDivergent});
   Value *Cond = IRB.CreateExtractValue(IfCall, {0});
   Value *Mask = IRB.CreateExtractValue(IfCall, {1});
   Term->setCondition(Cond);
@@ -231,13 +232,14 @@ bool SIAnnotateControlFlow::insertElse(BranchInst *Term) {
   Value *IncomingMask = popSaved();
   // Check if it's likely that at least one lane will always follow the
   // else-branch, i.e., the else-branch is never skipped completly.
-  Value *IsLikelyVarying =
-      LVHeuristic.isLikelyVarying(IncomingMask) ? BoolTrue : BoolFalse;
+  Value *IsLikelyDivergent = DivergenceHeuristic.isLikelyDivergent(IncomingMask)
+                                 ? BoolTrue
+                                 : BoolFalse;
 
   IRBuilder<> IRB(Term);
   Value *ElseCall =
       IRB.CreateCall(getDecl(Else, Intrinsic::amdgcn_else, {IntMask, IntMask}),
-                     {IncomingMask, IsLikelyVarying});
+                     {IncomingMask, IsLikelyDivergent});
   Value *Cond = IRB.CreateExtractValue(ElseCall, {0});
   Value *Mask = IRB.CreateExtractValue(ElseCall, {1});
   Term->setCondition(Cond);
@@ -418,8 +420,7 @@ bool SIAnnotateControlFlow::run() {
   return Changed;
 }
 
-bool LikelyVaryingHeuristic::isRelevantSourceOfDivergence(
-    const Value *V) const {
+bool DynamicDivergenceHeuristic::isWorkitemID(const Value *V) const {
   auto *II = dyn_cast<IntrinsicInst>(V);
   if (!II)
     return false;
@@ -439,11 +440,11 @@ bool LikelyVaryingHeuristic::isRelevantSourceOfDivergence(
   }
 }
 
-bool LikelyVaryingHeuristic::isLikelyVarying(const Value *V) {
+bool DynamicDivergenceHeuristic::isLikelyDivergent(const Value *V) {
   if (IsSingleLaneExecution)
     return false;
 
-  if (isRelevantSourceOfDivergence(V))
+  if (isWorkitemID(V))
     return true;
 
   auto *I = dyn_cast<Instruction>(V);
@@ -458,19 +459,19 @@ bool LikelyVaryingHeuristic::isLikelyVarying(const Value *V) {
     return false;
 
   // Have we already checked V?
-  auto CacheEntry = LikelyVaryingCache.find(V);
-  if (CacheEntry != LikelyVaryingCache.end())
+  auto CacheEntry = LikelyDivergentCache.find(V);
+  if (CacheEntry != LikelyDivergentCache.end())
     return CacheEntry->second;
 
   // Does it use a likely varying Value?
   bool Result = false;
   for (const auto &Use : I->operands()) {
-    Result |= isLikelyVarying(Use);
+    Result |= isLikelyDivergent(Use);
     if (Result)
       break;
   }
 
-  LikelyVaryingCache.insert({V, Result});
+  LikelyDivergentCache.insert({V, Result});
   return Result;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 5d0929a50f64101..18cfa56491780d7 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -417,8 +417,8 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
 let isTerminator = 1, isNotDuplicable = 1 in {
 
 def SI_IF: CFPseudoInstSI <
-  (outs SReg_1:$dst), (ins SReg_1:$vcc, i1imm:$likelyvarying, brtarget:$target),
-  [(set i1:$dst, (AMDGPUif i1:$vcc, (i1 timm:$likelyvarying), bb:$target))], 1, 1> {
+  (outs SReg_1:$dst), (ins SReg_1:$vcc, i1imm:$likelydivergent, brtarget:$target),
+  [(set i1:$dst, (AMDGPUif i1:$vcc, (i1 timm:$likelydivergent), bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 12;
   let hasSideEffects = 1;
@@ -427,7 +427,7 @@ def SI_IF: CFPseudoInstSI <
 
 def SI_ELSE : CFPseudoInstSI <
   (outs SReg_1:$dst),
-  (ins SReg_1:$src, i1imm:$likelyvarying, brtarget:$target), [], 1, 1> {
+  (ins SReg_1:$src, i1imm:$likelydivergent, brtarget:$target), [], 1, 1> {
   let Size = 12;
   let hasSideEffects = 1;
   let IsNeverUniform = 1;
@@ -1049,8 +1049,8 @@ def : GCNPat<
 >;
 
 def : GCNPat<
-  (AMDGPUelse i1:$src, i1:$likelyvarying, bb:$target),
-  (SI_ELSE $src, $likelyvarying, $target)
+  (AMDGPUelse i1:$src, i1:$likelydivergent, bb:$target),
+  (SI_ELSE $src, $likelydivergent, $target)
 >;
 
 def : Pat <
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index d59f3504a2e342d..e8a668838b1f5ef 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -227,7 +227,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineOperand &ImpDefSCC = MI.getOperand(5);
   assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
 
-  bool LikelyVarying = MI.getOperand(2).getImm();
+  bool LikelyDivergent = MI.getOperand(2).getImm();
 
   // If there is only one use of save exec register and that use is SI_END_CF,
   // we can optimize SI_IF by returning the full saved exec mask instead of
@@ -288,7 +288,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                             .add(MI.getOperand(3));
 
-  if (LikelyVarying) {
+  if (LikelyDivergent) {
     MachineBasicBlock *ExeczDest = MI.getOperand(3).getMBB();
     auto **E = MBB.succ_end();
     for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
@@ -344,7 +344,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
   if (LV)
     LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);
 
-  bool LikelyVarying = MI.getOperand(2).getImm();
+  bool LikelyDivergent = MI.getOperand(2).getImm();
 
   MachineBasicBlock *DestBB = MI.getOperand(3).getMBB();
 
@@ -369,7 +369,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
       BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
           .addMBB(DestBB);
 
-  if (LikelyVarying) {
+  if (LikelyDivergent) {
     auto **E = MBB.succ_end();
     for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
       if (*SI == DestBB)
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll b/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
similarity index 96%
rename from llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
rename to llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
index 0bd80d2c6007c14..f3e59f9955015f5 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-likely-varying-branches.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y()
 
 declare { i1, i64 } @llvm.amdgcn.if.i64(i1, i1)
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_even(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2:[0-9]+]] !reqd_work_group_size [[META0:![0-9]+]] {
@@ -48,7 +48,7 @@ exit:
 }
 
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_even_ann_cf(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ann_cf(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -98,7 +98,7 @@ exit:
 }
 
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_complex1(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex1(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -138,7 +138,7 @@ exit:
 }
 
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_complex2(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_complex2(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -178,7 +178,7 @@ exit:
 }
 
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(ptr addrspace(1) inreg %dest) !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_reqd_wgsz(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR3:[0-9]+]] !reqd_work_group_size [[META0]] {
@@ -220,7 +220,7 @@ exit:
 }
 
 
-; The branch here is likely varying:
+; The branch here is likely dynamically divergent:
 define amdgpu_kernel void @cond_store_even_only_flat_wgsz(ptr addrspace(1) inreg %dest) #0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_only_flat_wgsz(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] {
@@ -262,7 +262,7 @@ exit:
 }
 
 
-; The branch here is likely varying, since the y dimension varies in each
+; The branch here is likely dynamically divergent, since the y dimension varies in each
 ; wavefront with the required work group size:
 define amdgpu_kernel void @cond_store_even_ydim_small_wgs(ptr addrspace(1) inreg %dest) !reqd_work_group_size !1 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim_small_wgs(
@@ -305,7 +305,7 @@ exit:
 }
 
 
-; The branch here is likely varying, even though there are no attributes with
+; The branch here is likely dynamically divergent, even though there are no attributes with
 ; work group size information:
 define amdgpu_kernel void @cond_store_even_no_attributes(ptr addrspace(1) inreg %dest) {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_no_attributes(
@@ -348,7 +348,7 @@ exit:
 }
 
 
-; The branch here is likely varying, even though the condition only depends on a
+; The branch here is likely dynamically divergent, even though the condition only depends on a
 ; workitem id dimension that does not vary per wavefront (namely y):
 define amdgpu_kernel void @cond_store_even_ydim(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ydim(
@@ -391,7 +391,7 @@ exit:
 }
 
 
-; The branch here is not likely varying, because its condition is directly
+; The branch here is not likely dynamically divergent, because its condition is directly
 ; loaded from memory:
 define amdgpu_kernel void @cond_store_loaded(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_loaded(
@@ -436,7 +436,7 @@ exit:
 }
 
 
-; The branch here is not likely varying, because its condition directly results from a PHI:
+; The branch here is not likely dynamically divergent, because its condition directly results from a PHI:
 define amdgpu_kernel void @cond_store_loop_phi(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_loop_phi(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -497,7 +497,7 @@ exit:
   ret void
 }
 
-; The then and else branches are likely varying here:
+; The then and else branches are likely dynamically divergent here:
 define amdgpu_kernel void @cond_store_even_ifelse(ptr addrspace(1) inreg %dest) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_even_ifelse(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -553,7 +553,7 @@ exit:
 }
 
 
-; The then and else branches are not likely varying here:
+; The then and else branches are not likely dynamically divergent here:
 define amdgpu_kernel void @cond_store_loaded_ifelse(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_loaded_ifelse(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
diff --git a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
index 8fc9e357969beea..7cab4a88d17272b 100644
--- a/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
+++ b/llvm/test/CodeGen/AMDGPU/conditional-mem-no-cbranch-execz.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
 
 ; Check that simple conditional memory accesses that are guarded by likely
-; varying conditions are not lowered with an s_cbranch_execz to bypass them.
+; divergent conditions are not lowered with an s_cbranch_execz to bypass them.
 ; Instructions like s_waitcnt vmcnt(0) block the elimination of s_cbranch_execz.
 
 declare i32 @llvm.amdgcn.workitem.id.x()

>From 6de1df7cf9956021804413d0ad9eb797d9997547 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Thu, 23 Jan 2025 05:29:17 -0500
Subject: [PATCH 4/5] Also traverse phis and selects when looking for
 transitive uses of the workitem id.
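
For illustration, a minimal IR sketch (hypothetical, not one of the tests added
in this patch) of the pattern this enables: the branch condition only reaches
the workitem id through a loop phi, so the extended heuristic now reports it as
likely divergent, whereas the previous version stopped at the phi and returned
false.

declare i32 @llvm.amdgcn.workitem.id.x()

define amdgpu_kernel void @phi_of_tid(ptr addrspace(1) %dest, i32 %n) {
entry:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  br label %loop
loop:
  ; %val carries the workitem id around the loop; isLikelyDivergent now
  ; looks through this phi instead of giving up on it.
  %val = phi i32 [ %tid, %entry ], [ %val.inc, %loop ]
  %idx = phi i32 [ %n, %entry ], [ %idx.dec, %loop ]
  %val.inc = add i32 %val, 1
  %idx.dec = sub i32 %idx, 1
  %loop.cond = icmp eq i32 %idx.dec, 0
  br i1 %loop.cond, label %loop.end, label %loop
loop.end:
  ; The condition is traced icmp -> phi -> workitem.id.x, so
  ; SIAnnotateControlFlow would pass a true LikelyDivergent argument to amdgcn.if.
  %cond = icmp eq i32 %val, 0
  br i1 %cond, label %do.store, label %exit
do.store:
  %addr = getelementptr i32, ptr addrspace(1) %dest, i32 %tid
  store i32 0, ptr addrspace(1) %addr
  br label %exit
exit:
  ret void
}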

---
 .../Target/AMDGPU/SIAnnotateControlFlow.cpp   |   9 +-
 .../annotate-likely-divergent-branches.ll     | 155 +++++++++++++++++-
 .../test/CodeGen/AMDGPU/vni8-across-blocks.ll |   6 +-
 3 files changed, 158 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 8a15d15a7e7567a..372df832af8f463 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -52,6 +52,7 @@ class DynamicDivergenceHeuristic {
 
   bool isWorkitemID(const Value *V) const;
 
+  DenseSet<const Value *> Visited;
   ValueMap<const Value *, bool> LikelyDivergentCache;
 };
 
@@ -453,9 +454,9 @@ bool DynamicDivergenceHeuristic::isLikelyDivergent(const Value *V) {
 
   // ExtractValueInst and IntrinsicInst enable looking through the
   // amdgcn_if/else intrinsics inserted by SIAnnotateControlFlow.
-  // This condition excludes PHINodes, which prevents infinite recursion.
   if (!isa<BinaryOperator>(I) && !isa<UnaryOperator>(I) && !isa<CastInst>(I) &&
-      !isa<CmpInst>(I) && !isa<ExtractValueInst>(I) && !isa<IntrinsicInst>(I))
+      !isa<CmpInst>(I) && !isa<ExtractValueInst>(I) && !isa<IntrinsicInst>(I) &&
+      !isa<PHINode>(I) && !isa<SelectInst>(I))
     return false;
 
   // Have we already checked V?
@@ -463,6 +464,10 @@ bool DynamicDivergenceHeuristic::isLikelyDivergent(const Value *V) {
   if (CacheEntry != LikelyDivergentCache.end())
     return CacheEntry->second;
 
+  // Have we hit a cycle?
+  if (!Visited.insert(V).second)
+    return false;
+
   // Does it use a likely varying Value?
   bool Result = false;
   for (const auto &Use : I->operands()) {
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll b/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
index f3e59f9955015f5..bc975f89a9ee371 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-likely-divergent-branches.ll
@@ -436,7 +436,7 @@ exit:
 }
 
 
-; The branch here is not likely dynamically divergent, because its condition directly results from a PHI:
+; The branch here is likely dynamically divergent, even though it only uses the workitem id through a PHI:
 define amdgpu_kernel void @cond_store_loop_phi(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
 ; CHECK-LABEL: define amdgpu_kernel void @cond_store_loop_phi(
 ; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
@@ -455,10 +455,8 @@ define amdgpu_kernel void @cond_store_loop_phi(ptr addrspace(1) inreg %dest, ptr
 ; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp eq i32 [[IDX_DEC]], 0
 ; CHECK-NEXT:    br i1 [[LOOP_COND]], label %[[LOOP_END:.*]], label %[[LOOP]]
 ; CHECK:       [[LOOP_END]]:
-; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
-; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[LOOKUP_VALUE]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 false)
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
 ; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
 ; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
@@ -485,9 +483,154 @@ loop:
   %loop.cond = icmp eq i32 %idx.dec, 0
   br i1 %loop.cond, label %loop.end, label %loop
 loop.end:
+  %cond = icmp eq i32 %val, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+; The branch here is not likely dynamically divergent, since it doesn't use the workitem id:
+define amdgpu_kernel void @cond_store_loop_unrelated_phi(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_loop_unrelated_phi(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[LOOKUP_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[LOOKUP]], i64 [[TID_EXT]]
+; CHECK-NEXT:    [[LOOKUP_VALUE:%.*]] = load i32, ptr addrspace(1) [[LOOKUP_ADDR]], align 4
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[VAL:%.*]] = phi i32 [ [[VAL_INC:%.*]], %[[LOOP]] ], [ [[LOOKUP_VALUE]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[IDX:%.*]] = phi i32 [ [[IDX_DEC:%.*]], %[[LOOP]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT:    [[VAL_INC]] = add i32 [[VAL]], 1
+; CHECK-NEXT:    [[IDX_DEC]] = sub i32 [[IDX]], 1
+; CHECK-NEXT:    [[LOOP_COND:%.*]] = icmp eq i32 [[IDX_DEC]], 0
+; CHECK-NEXT:    br i1 [[LOOP_COND]], label %[[LOOP_END:.*]], label %[[LOOP]]
+; CHECK:       [[LOOP_END]]:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[VAL]], 20
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 false)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
   %lookup.addr = getelementptr i32, ptr addrspace(1) %lookup, i64 %tid.ext
   %lookup.value = load i32, ptr addrspace(1) %lookup.addr
-  %cond = icmp eq i32 %lookup.value, 0
+  br label %loop
+loop:
+  %val = phi i32 [%val.inc, %loop], [%lookup.value, %entry]
+  %idx = phi i32 [%idx.dec, %loop], [%n, %entry]
+  %val.inc = add i32 %val, 1
+  %idx.dec = sub i32 %idx, 1
+  %loop.cond = icmp eq i32 %idx.dec, 0
+  br i1 %loop.cond, label %loop.end, label %loop
+loop.end:
+  %cond = icmp eq i32 %val, 20
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+; The branch here is likely dynamically divergent, even though it only uses the workitem id through a select:
+define amdgpu_kernel void @cond_store_select_cond(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_select_cond(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[COND_SELECT:%.*]] = icmp eq i32 [[TID]], 13
+; CHECK-NEXT:    [[VAL:%.*]] = select i1 [[COND_SELECT]], i32 0, i32 1
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %cond.select = icmp eq i32 %tid, 13
+  %val = select i1 %cond.select, i32 0, i32 1
+  %cond = icmp eq i32 %val, 0
+  br i1 %cond, label %do.store, label %exit
+do.store:
+  %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
+  store i32 0, ptr addrspace(1) %local.addr
+  br label %exit
+exit:
+  ret void
+}
+
+; The branch here is likely dynamically divergent, even though it only uses the workitem id through a select:
+define amdgpu_kernel void @cond_store_select_val(ptr addrspace(1) inreg %dest, ptr addrspace(1) inreg %lookup, i32 %n) #0 !reqd_work_group_size !0 {
+; CHECK-LABEL: define amdgpu_kernel void @cond_store_select_val(
+; CHECK-SAME: ptr addrspace(1) inreg [[DEST:%.*]], ptr addrspace(1) inreg [[LOOKUP:%.*]], i32 [[N:%.*]]) #[[ATTR2]] !reqd_work_group_size [[META0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TID_X:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[TID_Y:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.y()
+; CHECK-NEXT:    [[TID_Y_SHIFT:%.*]] = shl nuw nsw i32 [[TID_Y]], 6
+; CHECK-NEXT:    [[TID:%.*]] = or disjoint i32 [[TID_X]], [[TID_Y_SHIFT]]
+; CHECK-NEXT:    [[TID_EXT:%.*]] = zext nneg i32 [[TID]] to i64
+; CHECK-NEXT:    [[COND_SELECT:%.*]] = icmp eq i32 [[N]], 13
+; CHECK-NEXT:    [[VAL:%.*]] = select i1 [[COND_SELECT]], i32 0, i32 [[TID]]
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[VAL]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = call { i1, i64 } @llvm.amdgcn.if.i64(i1 [[COND]], i1 true)
+; CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { i1, i64 } [[TMP0]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractvalue { i1, i64 } [[TMP0]], 1
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[DO_STORE:.*]], label %[[EXIT:.*]]
+; CHECK:       [[DO_STORE]]:
+; CHECK-NEXT:    [[LOCAL_ADDR:%.*]] = getelementptr i32, ptr addrspace(1) [[DEST]], i64 [[TID_EXT]]
+; CHECK-NEXT:    store i32 0, ptr addrspace(1) [[LOCAL_ADDR]], align 4
+; CHECK-NEXT:    br label %[[EXIT]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    call void @llvm.amdgcn.end.cf.i64(i64 [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.y = tail call i32 @llvm.amdgcn.workitem.id.y()
+  %tid.y.shift = shl nuw nsw i32 %tid.y, 6
+  %tid = or disjoint i32 %tid.x, %tid.y.shift
+  %tid.ext = zext nneg i32 %tid to i64
+  %cond.select = icmp eq i32 %n, 13
+  %val = select i1 %cond.select, i32 0, i32 %tid
+  %cond = icmp eq i32 %val, 0
   br i1 %cond, label %do.store, label %exit
 do.store:
   %local.addr = getelementptr i32, ptr addrspace(1) %dest, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 215615744d7eb95..46e2fb1db48fce8 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -563,7 +563,6 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:  .LBB10_2: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
-; GFX906-NEXT:    s_cbranch_execz .LBB10_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v10
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v4, 8, v8
@@ -585,7 +584,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
 ; GFX906-NEXT:    v_mov_b32_e32 v15, v7
 ; GFX906-NEXT:    v_mov_b32_e32 v12, v6
 ; GFX906-NEXT:    v_mov_b32_e32 v16, v5
-; GFX906-NEXT:  .LBB10_4: ; %bb.3
+; GFX906-NEXT:  ; %bb.4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v0, 8, v13
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v1, 8, v14
@@ -757,7 +756,6 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:  ; %bb.2: ; %Flow
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX906-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
-; GFX906-NEXT:    s_cbranch_execz .LBB13_4
 ; GFX906-NEXT:  ; %bb.3: ; %bb.2
 ; GFX906-NEXT:    v_lshlrev_b16_e32 v3, 8, v2
 ; GFX906-NEXT:    v_and_b32_e32 v4, 0xffffff00, v2
@@ -779,7 +777,7 @@ define amdgpu_kernel void @v8i8_multiuse_multiblock(ptr addrspace(1) %src1, ptr
 ; GFX906-NEXT:    global_store_dword v0, v4, s[14:15] offset:8
 ; GFX906-NEXT:    global_store_dword v0, v7, s[14:15] offset:16
 ; GFX906-NEXT:    global_store_dword v0, v2, s[14:15] offset:24
-; GFX906-NEXT:  .LBB13_4: ; %bb.3
+; GFX906-NEXT:  ; %bb.4: ; %bb.3
 ; GFX906-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX906-NEXT:    s_movk_i32 s3, 0xff00
 ; GFX906-NEXT:    v_mov_b32_e32 v4, 8

>From 5e11857b6f64ba3c7f80c0ea1cd9803bf7d78839 Mon Sep 17 00:00:00 2001
From: Fabian Ritter <fabian.ritter at amd.com>
Date: Fri, 24 Jan 2025 08:23:00 -0500
Subject: [PATCH 5/5] Extract common functionality into setSuccessorUnlikely

---
 llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 32 +++++++++++--------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index e8a668838b1f5ef..4812fbe30e34fc9 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -216,6 +216,21 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
   return true;
 }
 
+/// Mark Succ as an unlikely successor of MBB.
+static void setSuccessorUnlikely(MachineBasicBlock &MBB,
+                                 const MachineBasicBlock *Succ) {
+  auto **E = MBB.succ_end();
+  bool Found = false;
+  for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
+    if (*SI == Succ) {
+      MBB.setSuccProbability(SI, BranchProbability::getZero());
+      Found = true;
+    }
+  }
+  assert(Found && "Succ must be a successor of MBB!");
+  MBB.normalizeSuccProbs();
+}
+
 void SILowerControlFlow::emitIf(MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
   const DebugLoc &DL = MI.getDebugLoc();
@@ -290,12 +305,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
 
   if (LikelyDivergent) {
     MachineBasicBlock *ExeczDest = MI.getOperand(3).getMBB();
-    auto **E = MBB.succ_end();
-    for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
-      if (*SI == ExeczDest)
-        MBB.setSuccProbability(SI, BranchProbability::getZero());
-    }
-    MBB.normalizeSuccProbs();
+    setSuccessorUnlikely(MBB, ExeczDest);
   }
 
   if (!LIS) {
@@ -369,14 +379,8 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
       BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
           .addMBB(DestBB);
 
-  if (LikelyDivergent) {
-    auto **E = MBB.succ_end();
-    for (auto **SI = MBB.succ_begin(); SI != E; ++SI) {
-      if (*SI == DestBB)
-        MBB.setSuccProbability(SI, BranchProbability::getZero());
-    }
-    MBB.normalizeSuccProbs();
-  }
+  if (LikelyDivergent)
+    setSuccessorUnlikely(MBB, DestBB);
 
   if (!LIS) {
     MI.eraseFromParent();


