[llvm] 5df2af8 - [AMDGPU] Merge SIRemoveShortExecBranches into SIPreEmitPeephole

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 19 19:27:28 PDT 2021


Author: Carl Ritson
Date: 2021-03-20T11:26:42+09:00
New Revision: 5df2af8b0ef33f48b1ee72bcd27bc609b898da52

URL: https://github.com/llvm/llvm-project/commit/5df2af8b0ef33f48b1ee72bcd27bc609b898da52
DIFF: https://github.com/llvm/llvm-project/commit/5df2af8b0ef33f48b1ee72bcd27bc609b898da52.diff

LOG: [AMDGPU] Merge SIRemoveShortExecBranches into SIPreEmitPeephole

SIRemoveShortExecBranches is an optimisation, so it fits well in the
context of SIPreEmitPeephole.

Test changes relate to early termination from kills which have now
been lowered prior to considering branches for removal.
As these use s_cbranch, the execz skips are now retained instead.
Currently either behaviour is valid as kill with EXEC=0 is a nop;
however, if early termination is used differently in future then
the new behaviour is the correct one.

Reviewed By: foad

Differential Revision: https://reviews.llvm.org/D98917

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPU.h
    llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/lib/Target/AMDGPU/CMakeLists.txt
    llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
    llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
    llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
    llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
    llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
    llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
    llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
    llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
    llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn

Removed: 
    llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index cdd59fe0b847..4f9f888506b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -204,9 +204,6 @@ extern char &SIWholeQuadModeID;
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowID;
 
-void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
-extern char &SIRemoveShortExecBranchesID;
-
 void initializeSIPreEmitPeepholePass(PassRegistry &);
 extern char &SIPreEmitPeepholeID;
 

diff  --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 9db4e8c8472f..2b42f9e1281e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -249,7 +249,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIModeRegisterPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
-  initializeSIRemoveShortExecBranchesPass(*PR);
   initializeSIPreEmitPeepholePass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIMemoryLegalizerPass(*PR);
@@ -1215,7 +1214,6 @@ void GCNPassConfig::addPreEmitPass() {
   if (getOptLevel() > CodeGenOpt::None)
     addPass(&SIInsertHardClausesID);
 
-  addPass(&SIRemoveShortExecBranchesID);
   addPass(&SIInsertSkipsPassID);
   addPass(&SIPreEmitPeepholeID);
   // The hazard recognizer that runs as part of the post-ra scheduler does not

diff  --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 7aa256821167..03b0c0f45f2d 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -137,7 +137,6 @@ add_llvm_target(AMDGPUCodeGen
   SIPreEmitPeephole.cpp
   SIProgramInfo.cpp
   SIRegisterInfo.cpp
-  SIRemoveShortExecBranches.cpp
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp

diff  --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 5f10fefa469f..93d33fddff52 100644
--- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -21,6 +21,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "si-pre-emit-peephole"
 
+static unsigned SkipThreshold;
+
+static cl::opt<unsigned, true> SkipThresholdFlag(
+    "amdgpu-skip-threshold", cl::Hidden,
+    cl::desc(
+        "Number of instructions before jumping over divergent control flow"),
+    cl::location(SkipThreshold), cl::init(12));
+
 namespace {
 
 class SIPreEmitPeephole : public MachineFunctionPass {
@@ -30,6 +38,13 @@ class SIPreEmitPeephole : public MachineFunctionPass {
 
   bool optimizeVccBranch(MachineInstr &MI) const;
   bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
+                            MachineBasicBlock *&TrueMBB,
+                            MachineBasicBlock *&FalseMBB,
+                            SmallVectorImpl<MachineOperand> &Cond);
+  bool mustRetainExeczBranch(const MachineBasicBlock &From,
+                             const MachineBasicBlock &To) const;
+  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
 
 public:
   static char ID;
@@ -258,6 +273,74 @@ bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
   return true;
 }
 
+bool SIPreEmitPeephole::getBlockDestinations(
+    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
+    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
+  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  if (!FalseMBB)
+    FalseMBB = SrcMBB.getNextNode();
+
+  return true;
+}
+
+bool SIPreEmitPeephole::mustRetainExeczBranch(
+    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
+  unsigned NumInstr = 0;
+  const MachineFunction *MF = From.getParent();
+
+  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
+       MBBI != End && MBBI != ToI; ++MBBI) {
+    const MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I) {
+      // When a uniform loop is inside non-uniform control flow, the branch
+      // leaving the loop might never be taken when EXEC = 0.
+      // Hence we should retain cbranch out of the loop lest it become infinite.
+      if (I->isConditionalBranch())
+        return true;
+
+      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
+        return true;
+
+      // These instructions are potentially expensive even if EXEC = 0.
+      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
+          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
+        return true;
+
+      ++NumInstr;
+      if (NumInstr >= SkipThreshold)
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// Returns true if the skip branch instruction is removed.
+bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
+                                          MachineBasicBlock &SrcMBB) {
+  MachineBasicBlock *TrueMBB = nullptr;
+  MachineBasicBlock *FalseMBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+
+  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
+    return false;
+
+  // Consider only the forward branches.
+  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
+      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
+  MI.eraseFromParent();
+  SrcMBB.removeSuccessor(TrueMBB);
+
+  return true;
+}
+
 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
@@ -265,10 +348,12 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
   MachineBasicBlock *EmptyMBBAtEnd = nullptr;
   bool Changed = false;
 
+  MF.RenumberBlocks();
+
   for (MachineBasicBlock &MBB : MF) {
     MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
     MachineBasicBlock::iterator TermI = MBBE;
-    // Check first terminator for VCC branches to optimize
+    // Check first terminator for branches to optimize
     if (TermI != MBB.end()) {
       MachineInstr &MI = *TermI;
       switch (MI.getOpcode()) {
@@ -276,6 +361,9 @@ bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
       case AMDGPU::S_CBRANCH_VCCNZ:
         Changed |= optimizeVccBranch(MI);
         continue;
+      case AMDGPU::S_CBRANCH_EXECZ:
+        Changed |= removeExeczBranch(MI, MBB);
+        continue;
       default:
         break;
       }

diff  --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
deleted file mode 100644
index 104dea8fdff5..000000000000
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-- SIRemoveShortExecBranches.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This pass optmizes the s_cbranch_execz instructions.
-/// The pass removes this skip instruction for short branches,
-/// if there is no unwanted sideeffect in the fallthrough code sequence.
-///
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "si-remove-short-exec-branches"
-
-static unsigned SkipThreshold;
-
-static cl::opt<unsigned, true> SkipThresholdFlag(
-    "amdgpu-skip-threshold", cl::Hidden,
-    cl::desc(
-        "Number of instructions before jumping over divergent control flow"),
-    cl::location(SkipThreshold), cl::init(12));
-
-namespace {
-
-class SIRemoveShortExecBranches : public MachineFunctionPass {
-private:
-  const SIInstrInfo *TII = nullptr;
-  bool getBlockDestinations(MachineBasicBlock &SrcMBB,
-                            MachineBasicBlock *&TrueMBB,
-                            MachineBasicBlock *&FalseMBB,
-                            SmallVectorImpl<MachineOperand> &Cond);
-  bool mustRetainExeczBranch(const MachineBasicBlock &From,
-                             const MachineBasicBlock &To) const;
-  bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
-
-public:
-  static char ID;
-
-  SIRemoveShortExecBranches() : MachineFunctionPass(ID) {
-    initializeSIRemoveShortExecBranchesPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-};
-
-} // End anonymous namespace.
-
-INITIALIZE_PASS(SIRemoveShortExecBranches, DEBUG_TYPE,
-                "SI remove short exec branches", false, false)
-
-char SIRemoveShortExecBranches::ID = 0;
-
-char &llvm::SIRemoveShortExecBranchesID = SIRemoveShortExecBranches::ID;
-
-bool SIRemoveShortExecBranches::getBlockDestinations(
-    MachineBasicBlock &SrcMBB, MachineBasicBlock *&TrueMBB,
-    MachineBasicBlock *&FalseMBB, SmallVectorImpl<MachineOperand> &Cond) {
-  if (TII->analyzeBranch(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  if (!FalseMBB)
-    FalseMBB = SrcMBB.getNextNode();
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::mustRetainExeczBranch(
-    const MachineBasicBlock &From, const MachineBasicBlock &To) const {
-  unsigned NumInstr = 0;
-  const MachineFunction *MF = From.getParent();
-
-  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
-       MBBI != End && MBBI != ToI; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      // When a uniform loop is inside non-uniform control flow, the branch
-      // leaving the loop might never be taken when EXEC = 0.
-      // Hence we should retain cbranch out of the loop lest it become infinite.
-      if (I->isConditionalBranch())
-        return true;
-
-      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
-        return true;
-
-      if (TII->isKillTerminator(I->getOpcode()))
-        return true;
-
-      // These instructions are potentially expensive even if EXEC = 0.
-      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
-          TII->isDS(*I) || I->getOpcode() == AMDGPU::S_WAITCNT)
-        return true;
-
-      ++NumInstr;
-      if (NumInstr >= SkipThreshold)
-        return true;
-    }
-  }
-
-  return false;
-}
-
-// Returns true if the skip branch instruction is removed.
-bool SIRemoveShortExecBranches::removeExeczBranch(MachineInstr &MI,
-                                                  MachineBasicBlock &SrcMBB) {
-  MachineBasicBlock *TrueMBB = nullptr;
-  MachineBasicBlock *FalseMBB = nullptr;
-  SmallVector<MachineOperand, 1> Cond;
-
-  if (!getBlockDestinations(SrcMBB, TrueMBB, FalseMBB, Cond))
-    return false;
-
-  // Consider only the forward branches.
-  if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
-      mustRetainExeczBranch(*FalseMBB, *TrueMBB))
-    return false;
-
-  LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
-  MI.eraseFromParent();
-  SrcMBB.removeSuccessor(TrueMBB);
-
-  return true;
-}
-
-bool SIRemoveShortExecBranches::runOnMachineFunction(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  MF.RenumberBlocks();
-  bool Changed = false;
-
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
-    if (MBBI == MBB.end())
-      continue;
-
-    MachineInstr &MI = *MBBI;
-    switch (MI.getOpcode()) {
-    case AMDGPU::S_CBRANCH_EXECZ:
-      Changed = removeExeczBranch(MI, MBB);
-      break;
-    default:
-      break;
-    }
-  }
-
-  return Changed;
-}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
index f0d76065ddd5..1b8689d10a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll
@@ -166,12 +166,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; SI-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB2_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB2_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB2_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -192,12 +193,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX9-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB2_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB2_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -218,12 +220,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    s_xor_b32 s1, vcc_lo, -1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB2_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
 ; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -244,12 +247,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], vcc, -1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB2_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -284,13 +288,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB3_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB3_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB3_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -312,13 +317,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB3_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB3_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -340,13 +346,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
@@ -368,13 +375,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB3_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
@@ -416,13 +424,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB4_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB4_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB4_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 ; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -444,13 +453,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB4_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB4_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -472,13 +482,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
@@ -500,13 +511,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB4_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -660,13 +672,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB6_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
-; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:  BB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[4:5]
@@ -682,12 +695,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], -1
 ; SI-NEXT:    s_and_saveexec_b64 s[6:7], s[4:5]
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[6:7]
+; SI-NEXT:    s_cbranch_execz BB6_6
 ; SI-NEXT:  ; %bb.4: ; %.demote1
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.5: ; %.demote1
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:  BB6_6: ; %.continue1
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    v_mov_b32_e32 v0, s2
 ; SI-NEXT:    v_mov_b32_e32 v1, s3
@@ -706,13 +720,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB6_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:  BB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -728,12 +743,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB6_6
 ; GFX9-NEXT:  ; %bb.4: ; %.demote1
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.5: ; %.demote1
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:  BB6_6: ; %.continue1
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -752,13 +768,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s2, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:  BB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
@@ -772,12 +789,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_xor_b32 s1, s1, -1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:  BB6_6: ; %.continue1
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -796,13 +814,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:  BB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -816,12 +835,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], s[2:3], -1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:  BB6_6: ; %.continue1
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -883,13 +903,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB7_9
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[8:9], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[8:9]
-; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:  BB7_3: ; %.continue0.preheader
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    v_mov_b32_e32 v0, s6
@@ -948,13 +969,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB7_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s4
@@ -1013,13 +1035,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s3, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX10-32-NEXT:    s_branch BB7_5
@@ -1075,13 +1098,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], 0

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
index 6ce629a0dc05..7b37990dfa45 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
index 5424ad39b4d9..95b537367219 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0
 
 ---

diff  --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
index 928324492d51..97c8b50c50cb 100644
--- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
+++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-remove-short-exec-branches -amdgpu-skip-threshold=3 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s
 
 ---
 

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
index 0b0fb98cacb8..9edd1a397b78 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll
@@ -167,12 +167,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB2_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; SI-NEXT:    s_cbranch_scc0 BB2_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB2_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; SI-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -194,12 +195,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB2_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB2_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX9-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -221,12 +223,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-32-NEXT:    v_cmp_eq_u32_e64 s0, 1, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s0
 ; GFX10-32-NEXT:    s_xor_b32 s0, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s1, s1, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB2_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
 ; GFX10-32-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -248,12 +251,13 @@ define amdgpu_ps void @branch(float %arg0, float %arg1) {
 ; GFX10-64-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_xor_b64 s[0:1], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB2_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB2_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB2_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
 ; GFX10-64-NEXT:    exp mrt1 v0, v0, v0, v0 done vm
@@ -289,13 +293,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB3_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB3_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB3_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
@@ -317,13 +322,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB3_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB3_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
@@ -345,13 +351,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v1
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB3_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-32-NEXT:    s_waitcnt vmcnt(0)
@@ -373,13 +380,14 @@ define amdgpu_ps <4 x float> @wqm_demote_1(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v1
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB3_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB3_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB3_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    image_sample v0, v0, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
 ; GFX10-64-NEXT:    s_waitcnt vmcnt(0)
@@ -421,13 +429,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; SI-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; SI-NEXT:    s_cbranch_execz BB4_3
 ; SI-NEXT:  ; %bb.1: ; %.demote
 ; SI-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; SI-NEXT:    s_cbranch_scc0 BB4_4
 ; SI-NEXT:  ; %bb.2: ; %.demote
 ; SI-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; SI-NEXT:    s_and_b64 exec, exec, s[16:17]
-; SI-NEXT:  ; %bb.3: ; %.continue
+; SI-NEXT:  BB4_3: ; %.continue
 ; SI-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; SI-NEXT:    v_add_f32_e32 v0, v0, v0
 ; SI-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -449,13 +458,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], exec, s[14:15]
+; GFX9-NEXT:    s_cbranch_execz BB4_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote
 ; GFX9-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX9-NEXT:  ; %bb.2: ; %.demote
 ; GFX9-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX9-NEXT:  ; %bb.3: ; %.continue
+; GFX9-NEXT:  BB4_3: ; %.continue
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[14:15]
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -477,13 +487,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s13, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s13, exec_lo, s13
+; GFX10-32-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-32-NEXT:    s_andn2_b32 s12, s12, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-32-NEXT:    s_wqm_b32 s28, s12
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s28
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue
+; GFX10-32-NEXT:  BB4_3: ; %.continue
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s13
 ; GFX10-32-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s12
@@ -505,13 +516,14 @@ define amdgpu_ps <4 x float> @wqm_demote_2(<8 x i32> inreg %rsrc, <4 x i32> inre
 ; GFX10-64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[14:15], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[28:29], exec, s[14:15]
+; GFX10-64-NEXT:    s_cbranch_execz BB4_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote
 ; GFX10-64-NEXT:    s_andn2_b64 s[12:13], s[12:13], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB4_4
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote
 ; GFX10-64-NEXT:    s_wqm_b64 s[16:17], s[12:13]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[16:17]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue
+; GFX10-64-NEXT:  BB4_3: ; %.continue
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[28:29]
 ; GFX10-64-NEXT:    v_add_f32_e32 v0, v0, v0
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[12:13]
@@ -659,13 +671,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    s_cbranch_execz BB6_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[4:5]
-; SI-NEXT:  ; %bb.3: ; %.continue0
+; SI-NEXT:  BB6_3: ; %.continue0
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; SI-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -681,12 +694,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; SI-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB6_6
 ; SI-NEXT:  ; %bb.4: ; %.demote1
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB6_7
 ; SI-NEXT:  ; %bb.5: ; %.demote1
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.6: ; %.continue1
+; SI-NEXT:  BB6_6: ; %.continue1
 ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; SI-NEXT:    v_bfrev_b32_e32 v0, 60
 ; SI-NEXT:    v_mov_b32_e32 v1, 0x3c00
@@ -705,13 +719,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX9-NEXT:    s_cbranch_execz BB6_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0
+; GFX9-NEXT:  BB6_3: ; %.continue0
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -727,12 +742,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX9-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX9-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB6_6
 ; GFX9-NEXT:  ; %bb.4: ; %.demote1
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX9-NEXT:  ; %bb.5: ; %.demote1
 ; GFX9-NEXT:    s_mov_b64 exec, 0
-; GFX9-NEXT:  ; %bb.6: ; %.continue1
+; GFX9-NEXT:  BB6_6: ; %.continue1
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX9-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -751,13 +767,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-32-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s2, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s2
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-32-NEXT:  BB6_3: ; %.continue0
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-32-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s1
@@ -771,12 +788,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-32-NEXT:    s_or_b32 s1, s1, vcc_lo
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX10-32-NEXT:    s_xor_b32 s1, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-32-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-32-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-32-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-32-NEXT:  BB6_6: ; %.continue1
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-32-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-32-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -795,13 +813,14 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[4:5], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[4:5]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0
+; GFX10-64-NEXT:  BB6_3: ; %.continue0
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    s_mov_b64 s[2:3], s[0:1]
 ; GFX10-64-NEXT:    v_cndmask_b32_e64 v0, 1.0, 0, s[2:3]
@@ -815,12 +834,13 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) {
 ; GFX10-64-NEXT:    s_or_b64 s[2:3], s[2:3], vcc
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10-64-NEXT:    s_xor_b64 s[2:3], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB6_6
 ; GFX10-64-NEXT:  ; %bb.4: ; %.demote1
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB6_7
 ; GFX10-64-NEXT:  ; %bb.5: ; %.demote1
 ; GFX10-64-NEXT:    s_mov_b64 exec, 0
-; GFX10-64-NEXT:  ; %bb.6: ; %.continue1
+; GFX10-64-NEXT:  BB6_6: ; %.continue1
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX10-64-NEXT:    v_mov_b32_e32 v0, 0x3c00
 ; GFX10-64-NEXT:    v_bfrev_b32_e32 v1, 60
@@ -875,13 +895,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB7_3
 ; SI-NEXT:  ; %bb.1: ; %.demote0
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB7_9
 ; SI-NEXT:  ; %bb.2: ; %.demote0
 ; SI-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; SI-NEXT:    s_and_b64 exec, exec, s[6:7]
-; SI-NEXT:  ; %bb.3: ; %.continue0.preheader
+; SI-NEXT:  BB7_3: ; %.continue0.preheader
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b64 s[4:5], 0
 ; SI-NEXT:    s_branch BB7_5
@@ -940,13 +961,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX9-NEXT:    s_cbranch_execz BB7_3
 ; GFX9-NEXT:  ; %bb.1: ; %.demote0
 ; GFX9-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX9-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX9-NEXT:  ; %bb.2: ; %.demote0
 ; GFX9-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX9-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX9-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX9-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX9-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9-NEXT:    s_branch BB7_5
@@ -1005,13 +1027,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-32-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX10-32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-32-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-32-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-32-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-32-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-32-NEXT:    s_wqm_b32 s3, s0
 ; GFX10-32-NEXT:    s_and_b32 exec_lo, exec_lo, s3
-; GFX10-32-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-32-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-32-NEXT:    s_mov_b32 s2, 0
 ; GFX10-32-NEXT:    s_branch BB7_5
@@ -1067,13 +1090,14 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
 ; GFX10-64-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GFX10-64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-64-NEXT:    s_cbranch_execz BB7_3
 ; GFX10-64-NEXT:  ; %bb.1: ; %.demote0
 ; GFX10-64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-64-NEXT:    s_cbranch_scc0 BB7_9
 ; GFX10-64-NEXT:  ; %bb.2: ; %.demote0
 ; GFX10-64-NEXT:    s_wqm_b64 s[6:7], s[0:1]
 ; GFX10-64-NEXT:    s_and_b64 exec, exec, s[6:7]
-; GFX10-64-NEXT:  ; %bb.3: ; %.continue0.preheader
+; GFX10-64-NEXT:  BB7_3: ; %.continue0.preheader
 ; GFX10-64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-64-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX10-64-NEXT:    s_branch BB7_5

diff  --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
index 0f0d210799a9..3dddb0fef230 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are not removed around mode defs.
 # FIXME: -amdgpu-skip-threshold seems to be backwards.
 

diff  --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
index ee72fa99a129..58b1ab9ace01 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
+++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs  %s -o - | FileCheck %s
 # Make sure mandatory skips are not removed around mode defs.
 # FIXME: -amdgpu-skip-threshold seems to be backwards.
 

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
index 5979720d0cc7..4c53c51d1ce4 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
+++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-remove-short-exec-branches -amdgpu-skip-threshold=1000000 -o -  %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=1000000 -o -  %s | FileCheck %s
 
 ---
 name: skip_branch_taildup_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
index f535e28c6718..690fe5a7e683 100644
--- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -1002,13 +1002,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; SI-NEXT:    v_cmp_nle_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; SI-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; SI-NEXT:    s_cbranch_execz BB13_3
 ; SI-NEXT:  ; %bb.1: ; %bb3
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
 ; SI-NEXT:    s_cbranch_scc0 BB13_6
 ; SI-NEXT:  ; %bb.2: ; %bb3
 ; SI-NEXT:    s_andn2_b64 exec, exec, vcc
-; SI-NEXT:  ; %bb.3: ; %bb4
+; SI-NEXT:  BB13_3: ; %bb4
 ; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; SI-NEXT:    s_mov_b32 s1, s0
 ; SI-NEXT:    s_mov_b32 s2, s0
@@ -1043,13 +1044,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s0, 0
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; GFX10-WAVE64-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX10-WAVE64-NEXT:    s_cbranch_execz BB13_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE64-NEXT:    v_cmp_ngt_f32_e32 vcc, 0, v0
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[2:3], s[2:3], vcc
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB13_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 exec, exec, vcc
-; GFX10-WAVE64-NEXT:  ; %bb.3: ; %bb4
+; GFX10-WAVE64-NEXT:  BB13_3: ; %bb4
 ; GFX10-WAVE64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s1, s0
 ; GFX10-WAVE64-NEXT:    s_mov_b32 s2, s0
@@ -1082,13 +1084,14 @@ define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, float %arg2,
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s0, 0
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s2, exec_lo, s2
+; GFX10-WAVE32-NEXT:    s_cbranch_execz BB13_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %bb3
 ; GFX10-WAVE32-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, 0, v0
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s1, s1, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB13_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %bb3
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 exec_lo, exec_lo, vcc_lo
-; GFX10-WAVE32-NEXT:  ; %bb.3: ; %bb4
+; GFX10-WAVE32-NEXT:  BB13_3: ; %bb4
 ; GFX10-WAVE32-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s1, s0
 ; GFX10-WAVE32-NEXT:    s_mov_b32 s2, s0
@@ -1154,12 +1157,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; SI-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; SI-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; SI-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; SI-NEXT:    s_cbranch_execz BB14_3
 ; SI-NEXT:  ; %bb.1: ; %kill
 ; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; SI-NEXT:    s_cbranch_scc0 BB14_6
 ; SI-NEXT:  ; %bb.2: ; %kill
 ; SI-NEXT:    s_mov_b64 exec, 0
-; SI-NEXT:  ; %bb.3: ; %Flow
+; SI-NEXT:  BB14_3: ; %Flow
 ; SI-NEXT:    s_or_saveexec_b64 s[0:1], s[2:3]
 ; SI-NEXT:    ; implicit-def: $vgpr2
 ; SI-NEXT:    s_xor_b64 exec, exec, s[0:1]
@@ -1190,12 +1194,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE64-NEXT:    v_cmp_ge_f32_e32 vcc, 0, v1
 ; GFX10-WAVE64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
 ; GFX10-WAVE64-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX10-WAVE64-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE64-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE64-NEXT:    s_andn2_b64 s[0:1], s[0:1], exec
 ; GFX10-WAVE64-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE64-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE64-NEXT:    s_mov_b64 exec, 0
-; GFX10-WAVE64-NEXT:  ; %bb.3: ; %Flow
+; GFX10-WAVE64-NEXT:  BB14_3: ; %Flow
 ; GFX10-WAVE64-NEXT:    s_or_saveexec_b64 s[0:1], s[2:3]
 ; GFX10-WAVE64-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-WAVE64-NEXT:    s_xor_b64 exec, exec, s[0:1]
@@ -1226,12 +1231,13 @@ define amdgpu_ps void @cbranch_kill(i32 inreg %0, float %val0, float %val1) {
 ; GFX10-WAVE32-NEXT:    v_cmp_ge_f32_e32 vcc_lo, 0, v1
 ; GFX10-WAVE32-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-WAVE32-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-WAVE32-NEXT:    s_cbranch_execz BB14_3
 ; GFX10-WAVE32-NEXT:  ; %bb.1: ; %kill
 ; GFX10-WAVE32-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-WAVE32-NEXT:    s_cbranch_scc0 BB14_6
 ; GFX10-WAVE32-NEXT:  ; %bb.2: ; %kill
 ; GFX10-WAVE32-NEXT:    s_mov_b32 exec_lo, 0
-; GFX10-WAVE32-NEXT:  ; %bb.3: ; %Flow
+; GFX10-WAVE32-NEXT:  BB14_3: ; %Flow
 ; GFX10-WAVE32-NEXT:    s_or_saveexec_b32 s0, s1
 ; GFX10-WAVE32-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-WAVE32-NEXT:    s_xor_b32 exec_lo, exec_lo, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
index 9f42347f5ec8..e5a019e5d04a 100644
--- a/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
+++ b/llvm/test/CodeGen/AMDGPU/transform-block-with-return-to-epilog.ll
@@ -108,25 +108,26 @@ define amdgpu_ps { <4 x float> } @test_return_to_epilog_with_optimized_kill(floa
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3, $sgpr4_sgpr5
   ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
   ; GCN: bb.4.Flow1:
-  ; GCN:   successors: %bb.5(0x40000000)
+  ; GCN:   successors: %bb.5(0x40000000), %bb.7(0x40000000)
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
   ; GCN:   renamable $sgpr2_sgpr3 = S_OR_SAVEEXEC_B64 killed renamable $sgpr2_sgpr3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GCN:   $exec = S_XOR_B64 $exec, renamable $sgpr2_sgpr3, implicit-def $scc
+  ; GCN:   S_CBRANCH_EXECZ %bb.7, implicit $exec
   ; GCN: bb.5.kill0:
-  ; GCN:   successors: %bb.8(0x40000000), %bb.7(0x40000000)
+  ; GCN:   successors: %bb.6(0x40000000), %bb.8(0x40000000)
   ; GCN:   liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
   ; GCN:   dead renamable $sgpr0_sgpr1 = S_ANDN2_B64 killed renamable $sgpr0_sgpr1, $exec, implicit-def $scc
-  ; GCN:   S_CBRANCH_SCC0 %bb.7, implicit $scc
-  ; GCN: bb.8.kill0:
-  ; GCN:   successors: %bb.6(0x80000000)
+  ; GCN:   S_CBRANCH_SCC0 %bb.8, implicit $scc
+  ; GCN: bb.6.kill0:
+  ; GCN:   successors: %bb.7(0x80000000)
   ; GCN:   liveins: $sgpr2_sgpr3, $scc
   ; GCN:   $exec = S_MOV_B64 0
-  ; GCN: bb.6.end:
+  ; GCN: bb.7.end:
   ; GCN:   successors: %bb.9(0x80000000)
   ; GCN:   liveins: $sgpr2_sgpr3
   ; GCN:   $exec = S_OR_B64 $exec, killed renamable $sgpr2_sgpr3, implicit-def $scc
   ; GCN:   S_BRANCH %bb.9
-  ; GCN: bb.7:
+  ; GCN: bb.8:
   ; GCN:   $exec = S_MOV_B64 0
   ; GCN:   EXP_DONE 9, undef $vgpr0, undef $vgpr0, undef $vgpr0, undef $vgpr0, 1, 0, 0, implicit $exec
   ; GCN:   S_ENDPGM 0

diff  --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index 6b50aa58bd6b..3693e706bec4 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -222,7 +222,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "SIPreEmitPeephole.cpp",
     "SIProgramInfo.cpp",
     "SIRegisterInfo.cpp",
-    "SIRemoveShortExecBranches.cpp",
     "SIShrinkInstructions.cpp",
     "SIWholeQuadMode.cpp",
   ]


        


More information about the llvm-commits mailing list