[llvm] GlobalISel lane masks merging (PR #73337)

Petar Avramovic via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 4 07:54:12 PST 2023


https://github.com/petar-avramovic updated https://github.com/llvm/llvm-project/pull/73337

>From cd0cae6a2b53f6dc9b454ddfbfa5d9b6614e975c Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Fri, 24 Nov 2023 14:56:51 +0100
Subject: [PATCH 1/6] AMDGPU/GlobalISel: add AMDGPUGlobalISelDivergenceLowering
 pass

Add empty AMDGPUGlobalISelDivergenceLowering pass.
This pass will implement:
- selection of divergent i1 phis as lane mask phis,
  which requires lane mask merging in some cases
- lowering of uses of divergent i1 values outside of the cycle
  using lane mask merging
- lowering of all cases of temporal divergence:
  - uses of uniform i1 values outside of the cycle,
    lowered using lane mask merging
  - uses of uniform non-i1 values outside of the cycle,
    lowered using a copy to a vgpr inside of the cycle

Add a very detailed set of regression tests for the cases mentioned above.
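
For reference, the lane mask merging itself is intended to be the same
computation that SILowerI1Copies already builds for SDAG. A rough sketch of
one merge, in wave32 flavor to match the gfx10 tests (names are illustrative,
not literal output of this patch):

  %prev.masked = S_ANDN2_B32 %prev.mask, $exec_lo   ; keep bits of inactive lanes
  %cur.masked  = S_AND_B32   %cur.value, $exec_lo   ; take bits of active lanes
  %new.mask    = S_OR_B32    %prev.masked, %cur.masked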
---
 llvm/lib/Target/AMDGPU/AMDGPU.h               |    4 +
 .../AMDGPUGlobalISelDivergenceLowering.cpp    |   66 ++
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    2 +
 llvm/lib/Target/AMDGPU/CMakeLists.txt         |    1 +
 ...-divergent-i1-phis-no-lane-mask-merging.ll |  341 ++++++
 ...divergent-i1-phis-no-lane-mask-merging.mir |  610 ++++++++++
 ...vergence-divergent-i1-used-outside-loop.ll |  512 +++++++++
 ...ergence-divergent-i1-used-outside-loop.mir |  914 +++++++++++++++
 .../GlobalISel/divergence-structurizer.ll     |  461 ++++++++
 .../GlobalISel/divergence-structurizer.mir    | 1004 +++++++++++++++++
 .../divergence-temporal-divergent-i1.ll       |  171 +++
 .../divergence-temporal-divergent-i1.mir      |  324 ++++++
 .../divergence-temporal-divergent-reg.ll      |   37 +
 .../divergence-temporal-divergent-reg.mir     |   70 ++
 14 files changed, 4517 insertions(+)
 create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 323560a46f31d..007d64944244a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -36,6 +36,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass();
 FunctionPass *createSIWholeQuadModePass();
@@ -162,6 +163,9 @@ extern char &SILowerWWMCopiesID;
 void initializeSILowerI1CopiesPass(PassRegistry &);
 extern char &SILowerI1CopiesID;
 
+void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
+extern char &AMDGPUGlobalISelDivergenceLoweringID;
+
 void initializeSILowerSGPRSpillsPass(PassRegistry &);
 extern char &SILowerSGPRSpillsID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
new file mode 100644
index 0000000000000..5a4ae0d00b561
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -0,0 +1,66 @@
+//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// GlobalISel pass that selects divergent i1 phis as lane mask phis.
+// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
+// Handles all cases of temporal divergence.
+//
+// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this pass
+// currently depends on LCSSA to insert phis with one incoming value.
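+//
+// A minimal sketch of the LCSSA-like shape this relies on, written as generic
+// MIR (illustrative, not part of this patch): the i1 defined inside the loop
+// is read outside only through a phi with a single incoming value in the exit
+// block, and that phi is where the merged lane mask will be consumed:
+//
+//   bb.1 (loop):
+//     %cond:_(s1) = G_ICMP ...                    ; divergent i1 defined in loop
+//     G_BRCOND %exit.cond(s1), %bb.2
+//     G_BR %bb.1
+//   bb.2 (exit):
+//     %out:_(s1) = G_PHI %cond(s1), %bb.1         ; single incoming value
+//     %sel:_(s32) = G_SELECT %out(s1), %one, %two
+//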
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#define DEBUG_TYPE "global-isel-divergence-lowering"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
+    initializeAMDGPUGlobalISelDivergenceLoweringPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "GlobalISel divergence lowering";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+                      "GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+                    "GlobalISel divergence lowering", false, false)
+
+char AMDGPUGlobalISelDivergenceLowering::ID = 0;
+
+char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
+    AMDGPUGlobalISelDivergenceLowering::ID;
+
+FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
+  return new AMDGPUGlobalISelDivergenceLowering();
+}
+
+bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
+    MachineFunction &MF) {
+  return true;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0c38fa32c6f33..1d0be8984604d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -375,6 +375,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUDAGToDAGISelPass(*PR);
   initializeGCNDPPCombinePass(*PR);
   initializeSILowerI1CopiesPass(*PR);
+  initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
   initializeSILowerWWMCopiesPass(*PR);
   initializeSILowerSGPRSpillsPass(*PR);
   initializeSIFixSGPRCopiesPass(*PR);
@@ -1255,6 +1256,7 @@ bool GCNPassConfig::addLegalizeMachineIR() {
 void GCNPassConfig::addPreRegBankSelect() {
   bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
   addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+  addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
 }
 
 bool GCNPassConfig::addRegBankSelect() {
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index 53a33f8210d2a..2c92e7a073885 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -55,6 +55,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUCtorDtorLowering.cpp
   AMDGPUExportClustering.cpp
   AMDGPUFrameLowering.cpp
+  AMDGPUGlobalISelDivergenceLowering.cpp
   AMDGPUGlobalISelUtils.cpp
   AMDGPUHSAMetadataStreamer.cpp
   AMDGPUInsertDelayAlu.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
new file mode 100644
index 0000000000000..b909d78d5e874
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; Divergent phis that don't require lowering using lane mask merging
+
+; - divergent phi that has a divergent incoming value (this makes it divergent)
+;   but is reachable through only one path - the branch instruction that
+;   chooses the path is uniform
+
+; - divergent phi that is used only inside the loop and has an incoming value
+;   from the previous iteration. After phi-elimination (rewriting the lane mask
+;   in the phi def with the lane mask value from the previous iteration), the
+;   phi holds a lane mask valid for the current iteration, which is fine since
+;   it is not used outside of the loop.
+
+; And one more that is tricky (is the branch divergent or not?):
+; "amdgpu-flat-work-group-size"="1,1", i.e. single-lane execution, does not stop
+; a shader from activating multiple lanes via some intrinsic (entering wwm and
+; using dpp instructions).
+; - there are cases with single-lane execution where branch instructions are not
+;   lowered to si_if (or other intrinsic branches) - presumably with the
+;   intention of using a uniform branch after instruction selection?
+;   PhiIncomingAnalysis does not recognize G_BRCOND as a divergent branch and
+;   does not perform lane mask merging.
+
+
+
+define amdgpu_ps void @divergent_i1_phi_uniform_branch(i32 addrspace(1)* %out, i32 %tid, i32 inreg %cond, i32 addrspace(1)* %dummyaddr) {
+; GFX10-LABEL: divergent_i1_phi_uniform_branch:
+; GFX10:       ; %bb.0: ; %A
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_cbranch_scc0 .LBB0_2
+; GFX10-NEXT:  ; %bb.1:
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT:    s_branch .LBB0_3
+; GFX10-NEXT:  .LBB0_2: ; %dummy
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0x7b
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT:    global_store_dword v[3:4], v5, off
+; GFX10-NEXT:  .LBB0_3: ; %exit
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+A:
+  %val_A = icmp uge i32 %tid, 6
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %dummy, label %exit
+
+dummy:
+  store i32 123, i32 addrspace(1)* %dummyaddr
+  br label %B
+
+B:
+  %val_B = icmp ult i32 %tid, 1
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %sel = select i1 %phi, i32 1, i32 2
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; FIXME: there is no need to merge lane masks here.
+define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(i32 addrspace(1)* %out, i32 %tid, i32 inreg %cond) {
+; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
+; GFX10:       ; %bb.0: ; %A
+; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
+; GFX10-NEXT:  ; %bb.1:
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT:    s_branch .LBB1_3
+; GFX10-NEXT:  .LBB1_2: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT:  .LBB1_3: ; %exit
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+A:
+  %val_A = icmp uge i32 %tid, 6
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %B, label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 1
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %sel = select i1 %phi, i32 1, i32 2
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+
+; Divergent i1 phi that uses value from previous iteration.
+; Used only inside the loop (variable name is bool_counter)
+define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:  .LBB2_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[1:2], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+  %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+  %neg_bool_counter = xor i1 %bool_counter, true
+  %fcounter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %fcounter, %val
+  %counterPlus1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, float %pre_cond_val, ptr %addr, ptr %addr_if, ptr %addr_else) {
+; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0x3e8
+; GFX10-NEXT:    v_mov_b32_e32 v9, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    s_branch .LBB3_2
+; GFX10-NEXT:  .LBB3_1: ; %loop_body
+; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v10, v9
+; GFX10-NEXT:    v_xor_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB3_6
+; GFX10-NEXT:  .LBB3_2: ; %loop_start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9
+; GFX10-NEXT:    s_mov_b32 s5, 1
+; GFX10-NEXT:    s_cbranch_vccz .LBB3_4
+; GFX10-NEXT:  ; %bb.3: ; %else
+; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    flat_store_dword v[6:7], v8
+; GFX10-NEXT:  .LBB3_4: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT:    s_xor_b32 s5, s5, 1
+; GFX10-NEXT:    s_and_b32 s5, s5, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX10-NEXT:  ; %bb.5: ; %if
+; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
+; GFX10-NEXT:    flat_store_dword v[4:5], v8
+; GFX10-NEXT:    s_branch .LBB3_1
+; GFX10-NEXT:  .LBB3_6: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %pre_cond = fcmp ogt float %pre_cond_val, 1.0
+  br label %loop_start
+
+loop_start:
+  %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop_body ]
+  %bool_counter = phi i1 [ %pre_cond, %entry ], [ %neg_bool_counter, %loop_body ]
+  %cond_break = icmp sgt i32 %counter, 1000
+  br i1 %cond_break, label %if, label %else
+
+if:
+  store i32 1000, ptr %addr_if
+  br label %loop_body
+
+else:
+  store i32 1000, ptr %addr_else
+  br label %loop_body
+
+loop_body:
+  %neg_bool_counter = xor i1 %bool_counter, true
+  %fcounter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %fcounter, %val
+  %counterPlus1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop_start
+
+exit:
+  %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+; According to machine uniformity info there is a divergent G_BRCOND branch
+; here; it is not lowered to si_if because of "amdgpu-flat-work-group-size"="1,1".
+define dllexport amdgpu_cs void @_amdgpu_cs_main( i32 inreg noundef %.userdata0, <3 x i32> inreg noundef %.WorkgroupId, <3 x i32> noundef %.LocalInvocationId) #0 {
+; GFX10-LABEL: _amdgpu_cs_main:
+; GFX10:       ; %bb.0: ; %.entry
+; GFX10-NEXT:    s_mov_b32 s12, 0
+; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_mov_b32 s13, -1
+; GFX10-NEXT:    s_mov_b32 s2, s0
+; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
+; GFX10-NEXT:    s_mov_b32 s3, s12
+; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v1, -1, 0
+; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GFX10-NEXT:    s_mov_b32 s2, 1
+; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    buffer_load_dword v1, v1, s[4:7], 0 offen
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    ; implicit-def: $vgpr3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX10-NEXT:    s_cbranch_vccnz .LBB4_4
+; GFX10-NEXT:  ; %bb.1: ; %.preheader.preheader
+; GFX10-NEXT:    v_mov_b32_e32 v4, s12
+; GFX10-NEXT:    v_mov_b32_e32 v3, s12
+; GFX10-NEXT:  .LBB4_2: ; %.preheader
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    buffer_load_dword v5, v4, s[4:7], 0 offen
+; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 4, v4
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v3
+; GFX10-NEXT:    s_cbranch_vccnz .LBB4_2
+; GFX10-NEXT:  ; %bb.3: ; %.preheader._crit_edge
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v1
+; GFX10-NEXT:    s_or_b32 s2, s0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    s_mov_b32 s2, 0
+; GFX10-NEXT:  .LBB4_4: ; %Flow
+; GFX10-NEXT:    s_and_b32 s2, s2, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10-NEXT:    s_cbranch_scc0 .LBB4_6
+; GFX10-NEXT:  ; %bb.5: ; %.19
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
+; GFX10-NEXT:    v_or_b32_e32 v3, 2, v1
+; GFX10-NEXT:  .LBB4_6: ; %.22
+; GFX10-NEXT:    v_add_lshl_u32 v0, v0, s1, 2
+; GFX10-NEXT:    buffer_store_dword v3, v0, s[8:11], 0 offen
+; GFX10-NEXT:    s_endpgm
+.entry:
+  %.0 = call i64 @llvm.amdgcn.s.getpc()
+  %.1 = and i64 %.0, -4294967296
+  %.2 = zext i32 %.userdata0 to i64
+  %.3 = or i64 %.1, %.2
+  %.4 = inttoptr i64 %.3 to ptr addrspace(4)
+  %.5 = getelementptr i8, ptr addrspace(4) %.4, i64 16
+  %.6 = load <4 x i32>, ptr addrspace(4) %.5, align 16
+  %.7 = load <4 x i32>, ptr addrspace(4) %.4, align 16
+  %.8 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
+  %.9 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %.8)
+  %.fr11 = freeze i32 %.9
+  %.idx = shl i32 %.fr11, 2
+  %.10 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %.7, i32 %.idx, i32 0, i32 0)
+  %.11 = icmp eq i32 %.10, 0
+  %.12 = and i32 %.fr11, 1
+  %.not = icmp eq i32 %.12, 0
+  br i1 %.not, label %.19, label %.preheader
+
+.preheader:                                       ; preds = %.entry, %.preheader
+  %._96.02 = phi i32 [ %.15, %.preheader ], [ 0, %.entry ]
+  %._50.01 = phi i32 [ %.14, %.preheader ], [ 0, %.entry ]
+  %.idx5 = shl i32 %._96.02, 2
+  %.13 = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %.7, i32 %.idx5, i32 0, i32 0)
+  %.14 = add i32 %.13, %._50.01
+  %.15 = add nuw i32 %._96.02, 1
+  %.exitcond.not = icmp eq i32 %.15, %.fr11
+  br i1 %.exitcond.not, label %.preheader._crit_edge, label %.preheader
+
+.preheader._crit_edge:                            ; preds = %.preheader
+  %.16 = icmp eq i32 %.14, %.10
+  %.17 = or i1 %.11, %.16
+  %.18 = zext i1 %.17 to i32
+  br label %.22
+
+.19:                                               ; preds = %.entry
+  %.20 = zext i1 %.11 to i32
+  %.21 = or i32 %.20, 2
+  br label %.22
+
+.22:                                               ; preds = %.19, %.preheader._crit_edge
+  %._51.0 = phi i32 [ %.18, %.preheader._crit_edge ], [ %.21, %.19 ]
+  %.WorkgroupId.i0 = extractelement <3 x i32> %.WorkgroupId, i64 0
+  %.LocalInvocationId.i0 = extractelement <3 x i32> %.LocalInvocationId, i64 0
+  %.i0 = add i32 %.LocalInvocationId.i0, %.WorkgroupId.i0
+  %.idx6 = shl i32 %.i0, 2
+  call void @llvm.amdgcn.raw.buffer.store.i32(i32 %._51.0, <4 x i32> %.6, i32 %.idx6, i32 0, i32 0)
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare i64 @llvm.amdgcn.s.getpc()
+
+; Function Attrs: nounwind willreturn memory(none)
+declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
+declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+declare void @llvm.amdgcn.raw.buffer.store.i32(i32, <4 x i32>, i32, i32, i32 immarg)
+
+attributes #0 = { nounwind memory(readwrite) "amdgpu-flat-work-group-size"="1,1" }
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
new file mode 100644
index 0000000000000..a46bc505ceb59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -0,0 +1,610 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+--- |
+  define void @divergent_i1_phi_uniform_branch() {ret void}
+  define void @divergent_i1_phi_uniform_branch_simple() {ret void}
+  define void @divergent_i1_phi_used_inside_loop() {ret void}
+  define void @divergent_i1_phi_used_inside_loop_bigger_loop_body() {ret void}
+  define void @_amdgpu_cs_main() #0 {ret void}
+
+  attributes #0 = {"amdgpu-flat-work-group-size"="1,1"}
+...
+
+---
+name: divergent_i1_phi_uniform_branch
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_uniform_branch
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; GFX10-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 123
+  ; GFX10-NEXT:   G_STORE [[C2]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $sgpr0
+    %5:_(s32) = COPY $vgpr3
+    %6:_(s32) = COPY $vgpr4
+    %7:_(p1) = G_MERGE_VALUES %5(s32), %6(s32)
+    %8:_(s32) = G_CONSTANT i32 6
+    %9:_(s1) = G_ICMP intpred(uge), %3(s32), %8
+    %10:_(s32) = G_CONSTANT i32 0
+    %11:_(s1) = G_ICMP intpred(ne), %4(s32), %10
+    G_BRCOND %11(s1), %bb.2
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %12:_(s32) = G_CONSTANT i32 123
+    G_STORE %12(s32), %7(p1) :: (store (s32), addrspace 1)
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+
+    %13:_(s1) = G_PHI %14(s1), %bb.3, %9(s1), %bb.0
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.2(0x80000000)
+
+    %15:_(s32) = G_CONSTANT i32 1
+    %14:_(s1) = G_ICMP intpred(ult), %3(s32), %15
+    G_BR %bb.2
+
+  bb.4:
+    %16:_(s32) = G_CONSTANT i32 2
+    %17:_(s32) = G_CONSTANT i32 1
+    %18:_(s32) = G_SELECT %13(s1), %17, %16
+    G_STORE %18(s32), %2(p1) :: (store (s32), addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_uniform_branch_simple
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_uniform_branch_simple
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x30000000), %bb.2(0x50000000)
+  ; GFX10-NEXT:   liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x30000000), %bb.2(0x50000000)
+    liveins: $sgpr0, $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $sgpr0
+    %5:_(s32) = G_CONSTANT i32 6
+    %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5
+    %7:_(s32) = G_CONSTANT i32 0
+    %8:_(s1) = G_ICMP intpred(ne), %4(s32), %7
+    G_BRCOND %8(s1), %bb.2
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %9:_(s32) = G_CONSTANT i32 1
+    %10:_(s1) = G_ICMP intpred(ult), %3(s32), %9
+
+  bb.2:
+    %11:_(s1) = G_PHI %6(s1), %bb.0, %10(s1), %bb.1
+    %12:_(s32) = G_CONSTANT i32 2
+    %13:_(s32) = G_CONSTANT i32 1
+    %14:_(s32) = G_SELECT %11(s1), %13, %12
+    G_STORE %14(s32), %2(p1) :: (store (s32), addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_used_inside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_used_inside_loop
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s1) = G_CONSTANT i1 true
+    %5:_(s32) = G_CONSTANT i32 0
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+    %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+    %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+    %12:_(s1) = G_CONSTANT i1 true
+    %11:_(s1) = G_XOR %10, %12
+    %13:_(s32) = G_UITOFP %8(s32)
+    %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+    %15:_(s32) = G_CONSTANT i32 1
+    %9:_(s32) = G_ADD %8, %15
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %16:_(s1) = G_PHI %11(s1), %bb.1
+    %17:_(s32) = G_PHI %7(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    %18:_(s32) = G_FCONSTANT float 0.000000e+00
+    %19:_(s32) = G_FCONSTANT float 1.000000e+00
+    %20:_(s32) = G_SELECT %16(s1), %19, %18
+    G_STORE %20(s32), %3(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: divergent_i1_phi_used_inside_loop_bigger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_used_inside_loop_bigger_loop_body
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI1]](s32), [[C3]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.4
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]]
+  ; GFX10-NEXT:   G_BRCOND [[XOR]](s1), %bb.5
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+  ; GFX10-NEXT:   G_STORE [[C5]](s32), [[MV1]](p0) :: (store (s32))
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
+  ; GFX10-NEXT:   G_STORE [[C7]](s32), [[MV2]](p0) :: (store (s32))
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C8]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C9]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[XOR1]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C11]], [[C10]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+    %5:_(s32) = COPY $vgpr4
+    %6:_(s32) = COPY $vgpr5
+    %7:_(p0) = G_MERGE_VALUES %5(s32), %6(s32)
+    %8:_(s32) = COPY $vgpr6
+    %9:_(s32) = COPY $vgpr7
+    %10:_(p0) = G_MERGE_VALUES %8(s32), %9(s32)
+    %11:_(s32) = G_CONSTANT i32 0
+    %12:_(s32) = G_FCONSTANT float 1.000000e+00
+    %13:_(s1) = G_FCMP floatpred(ogt), %1(s32), %12
+
+  bb.1:
+    successors: %bb.4(0x40000000), %bb.2(0x40000000)
+
+    %14:_(s32) = G_PHI %15(s32), %bb.5, %11(s32), %bb.0
+    %16:_(s32) = G_PHI %11(s32), %bb.0, %17(s32), %bb.5
+    %18:_(s1) = G_PHI %13(s1), %bb.0, %19(s1), %bb.5
+    %20:_(s1) = G_CONSTANT i1 true
+    %21:_(s32) = G_CONSTANT i32 1000
+    %22:_(s1) = G_ICMP intpred(sle), %16(s32), %21
+    G_BRCOND %22(s1), %bb.4
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+    %23:_(s1) = G_PHI %24(s1), %bb.4, %20(s1), %bb.1
+    %25:_(s1) = G_CONSTANT i1 true
+    %26:_(s1) = G_XOR %23, %25
+    G_BRCOND %26(s1), %bb.5
+    G_BR %bb.3
+
+  bb.3:
+    successors: %bb.5(0x80000000)
+
+    %27:_(s32) = G_CONSTANT i32 1000
+    G_STORE %27(s32), %7(p0) :: (store (s32))
+    G_BR %bb.5
+
+  bb.4:
+    successors: %bb.2(0x80000000)
+
+    %24:_(s1) = G_CONSTANT i1 false
+    %28:_(s32) = G_CONSTANT i32 1000
+    G_STORE %28(s32), %10(p0) :: (store (s32))
+    G_BR %bb.2
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+    %29:_(s1) = G_CONSTANT i1 true
+    %19:_(s1) = G_XOR %18, %29
+    %30:_(s32) = G_UITOFP %16(s32)
+    %31:_(s1) = G_FCMP floatpred(ogt), %30(s32), %0
+    %32:_(s32) = G_CONSTANT i32 1
+    %17:_(s32) = G_ADD %16, %32
+    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %31(s1), %14(s32)
+    SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.6:
+    %33:_(s1) = G_PHI %19(s1), %bb.5
+    %34:_(s32) = G_PHI %15(s32), %bb.5
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    %35:_(s32) = G_FCONSTANT float 0.000000e+00
+    %36:_(s32) = G_FCONSTANT float 1.000000e+00
+    %37:_(s32) = G_SELECT %33(s1), %36, %35
+    G_STORE %37(s32), %4(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: _amdgpu_cs_main
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: _amdgpu_cs_main
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.getpc)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 -4294967296
+  ; GFX10-NEXT:   [[AND:%[0-9]+]]:_(s64) = G_AND [[INT]], [[C]]
+  ; GFX10-NEXT:   [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY]](s32)
+  ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s64) = G_OR [[AND]], [[ZEXT]]
+  ; GFX10-NEXT:   [[INTTOPTR:%[0-9]+]]:_(p4) = G_INTTOPTR [[OR]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[INTTOPTR]](p4) :: (load (<8 x s32>))
+  ; GFX10-NEXT:   [[BITCAST:%[0-9]+]]:_(s256) = G_BITCAST [[LOAD]](<8 x s32>)
+  ; GFX10-NEXT:   [[TRUNC:%[0-9]+]]:_(s128) = G_TRUNC [[BITCAST]](s256)
+  ; GFX10-NEXT:   [[BITCAST1:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[TRUNC]](s128)
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[INT1:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.lo), [[C2]](s32), [[C1]](s32)
+  ; GFX10-NEXT:   [[INT2:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.hi), [[C2]](s32), [[INT1]](s32)
+  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s32) = G_FREEZE [[INT2]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[FREEZE]], [[C3]](s32)
+  ; GFX10-NEXT:   [[AMDGPU_BUFFER_LOAD:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[BITCAST1]](<4 x s32>), [[C1]](s32), [[SHL]], [[C1]], 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_BUFFER_LOAD]](s32), [[C1]]
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[AND1:%[0-9]+]]:_(s32) = G_AND [[FREEZE]], [[C4]]
+  ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
+  ; GFX10-NEXT:   G_BRCOND [[XOR]](s1), %bb.2
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
+  ; GFX10-NEXT:   G_BRCOND [[PHI1]](s1), %bb.5
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %34(s32), %bb.3, [[C6]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %36(s32), %bb.3, [[FREEZE]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %38(s32), %bb.3, [[C6]](s32), %bb.1
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[AMDGPU_BUFFER_LOAD1:%[0-9]+]]:_(s32) = G_AMDGPU_BUFFER_LOAD [[BITCAST1]](<4 x s32>), [[C7]](s32), [[PHI2]], [[C7]], 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[AMDGPU_BUFFER_LOAD1]], [[PHI4]]
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C8]]
+  ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+  ; GFX10-NEXT:   [[ADD2:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[ADD1]](s32), [[C7]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.3
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.3
+  ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
+  ; GFX10-NEXT:   [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
+  ; GFX10-NEXT:   [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[ZEXT2:%[0-9]+]]:_(s32) = G_ZEXT [[ICMP]](s1)
+  ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
+  ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
+  ; GFX10-NEXT:   [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
+  ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[ADD3]], [[C12]](s32)
+  ; GFX10-NEXT:   [[C13:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   G_AMDGPU_BUFFER_STORE [[PHI6]](s32), [[UV1]](<4 x s32>), [[C13]](s32), [[SHL1]], [[C13]], 0, 0, 0 :: (store (s32), align 1, addrspace 8)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $sgpr0
+    %1:_(s32) = COPY $sgpr1
+    %2:_(s32) = COPY $vgpr0
+    %3:_(s32) = G_IMPLICIT_DEF
+    %4:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.s.getpc)
+    %5:_(s64) = G_CONSTANT i64 -4294967296
+    %6:_(s64) = G_AND %4, %5
+    %7:_(s64) = G_ZEXT %0(s32)
+    %8:_(s64) = G_OR %6, %7
+    %9:_(p4) = G_INTTOPTR %8(s64)
+    %10:_(<8 x s32>) = G_LOAD %9(p4) :: (load (<8 x s32>))
+    %11:_(s256) = G_BITCAST %10(<8 x s32>)
+    %12:_(s128) = G_TRUNC %11(s256)
+    %13:_(<4 x s32>) = G_BITCAST %12(s128)
+    %15:_(s32) = G_CONSTANT i32 0
+    %14:_(s32) = G_CONSTANT i32 -1
+    %16:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.lo), %14(s32), %15(s32)
+    %17:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.mbcnt.hi), %14(s32), %16(s32)
+    %18:_(s32) = G_FREEZE %17
+    %19:_(s32) = G_CONSTANT i32 2
+    %20:_(s32) = G_SHL %18, %19(s32)
+    %21:_(s32) = G_AMDGPU_BUFFER_LOAD %13(<4 x s32>), %15(s32), %20, %15, 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+    %22:_(s1) = G_ICMP intpred(eq), %21(s32), %15
+    %23:_(s32) = G_CONSTANT i32 1
+    %24:_(s32) = G_AND %18, %23
+    %25:_(s1) = G_TRUNC %24(s32)
+    %26:_(s1) = G_CONSTANT i1 true
+    %27:_(s1) = G_XOR %25, %26
+    G_BRCOND %27(s1), %bb.2
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %28:_(s32) = G_CONSTANT i32 0
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %29:_(s32) = G_PHI %30(s32), %bb.4, %3(s32), %bb.0
+    %31:_(s1) = G_PHI %32(s1), %bb.4, %26(s1), %bb.0
+    G_BRCOND %31(s1), %bb.5
+    G_BR %bb.6
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.3(0x7c000000)
+
+    %33:_(s32) = G_PHI %34(s32), %bb.3, %28(s32), %bb.1
+    %35:_(s32) = G_PHI %36(s32), %bb.3, %18(s32), %bb.1
+    %37:_(s32) = G_PHI %38(s32), %bb.3, %28(s32), %bb.1
+    %39:_(s32) = G_CONSTANT i32 0
+    %40:_(s32) = G_AMDGPU_BUFFER_LOAD %13(<4 x s32>), %39(s32), %33, %39, 0, 0, 0 :: (load (s32), align 1, addrspace 8)
+    %38:_(s32) = G_ADD %40, %37
+    %41:_(s32) = G_CONSTANT i32 -1
+    %36:_(s32) = G_ADD %35, %41
+    %42:_(s32) = G_CONSTANT i32 4
+    %34:_(s32) = G_ADD %33, %42
+    %43:_(s1) = G_ICMP intpred(ne), %36(s32), %39
+    G_BRCOND %43(s1), %bb.3
+    G_BR %bb.4
+
+  bb.4:
+    successors: %bb.2(0x80000000)
+
+    %44:_(s32) = G_PHI %38(s32), %bb.3
+    %32:_(s1) = G_CONSTANT i1 false
+    %45:_(s1) = G_ICMP intpred(eq), %44(s32), %21
+    %46:_(s1) = G_OR %22, %45
+    %30:_(s32) = G_ZEXT %46(s1)
+    G_BR %bb.2
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+    %47:_(s32) = G_ZEXT %22(s1)
+    %48:_(s32) = G_CONSTANT i32 2
+    %49:_(s32) = G_OR %47, %48
+
+  bb.6:
+    %50:_(s32) = G_PHI %29(s32), %bb.2, %49(s32), %bb.5
+    %51:_(<4 x s32>), %52:_(<4 x s32>) = G_UNMERGE_VALUES %10(<8 x s32>)
+    %53:_(s32) = G_ADD %2, %1
+    %54:_(s32) = G_CONSTANT i32 2
+    %55:_(s32) = G_SHL %53, %54(s32)
+    %56:_(s32) = G_CONSTANT i32 0
+    G_AMDGPU_BUFFER_STORE %50(s32), %52(<4 x s32>), %56(s32), %55, %56, 0, 0, 0 :: (store (s32), align 1, addrspace 8)
+    S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
new file mode 100644
index 0000000000000..7a68b94eb9a3a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -0,0 +1,512 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; This file contains various tests that have divergent i1s used outside of
+; the loop. These lane masks live in an sgpr and need to have the correct value
+; in the corresponding bit at the iteration in which the lane exits the loop.
+; This is achieved by merging the lane mask with the same lane mask from the
+; previous iteration and using that merged lane mask outside of the loop.
+
+; Phi used outside of the loop directly (loopfinder will figure out that it
+; needs to merge lane mask across all iterations)
+define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_outside_loop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:  .LBB0_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v6, v1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v0
+; GFX10-NEXT:    v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
+  %neg.bool.counter = xor i1 %bool.counter, true
+  %f.counter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %f.counter, %val
+  %counter.plus.1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %select = select i1 %bool.counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_phi_used_outside_loop_larger_loop_body:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s4, -1
+; GFX10-NEXT:    v_mov_b32_e32 v5, 1
+; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_branch .LBB1_2
+; GFX10-NEXT:  .LBB1_1: ; %loop.cond
+; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, 4
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
+; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    s_cbranch_vccz .LBB1_4
+; GFX10-NEXT:  .LBB1_2: ; %loop.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v5
+; GFX10-NEXT:    s_mov_b32 s6, s5
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s5
+; GFX10-NEXT:    s_cbranch_execz .LBB1_1
+; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
+; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
+; GFX10-NEXT:    global_load_dword v5, v[1:2], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v5
+; GFX10-NEXT:    s_branch .LBB1_1
+; GFX10-NEXT:  .LBB1_4: ; %exit
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
+; GFX10-NEXT:    flat_store_dword v[3:4], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop.start
+
+loop.start:
+  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero, %loop.cond ]
+  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
+
+is.eq.zero:
+  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+  %elt.i = load i32, ptr addrspace(1) %a.plus.i
+  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+  br label %loop.cond
+
+loop.cond:
+  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
+  %cond = icmp slt i32 %i, 10
+  %i.plus.1 = add i32 %i, 1
+  br i1 %cond, label %exit, label %loop.start
+
+exit:
+  %select = select i1 %all.eq.zero, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+; Non-phi i1 value used outside of the loop
+
+define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val, ptr %addr) {
+; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:  .LBB2_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT:    v_xor_b32_e32 v1, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[2:3], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %pre.cond = fcmp ogt float %pre.cond.val, 1.0
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+  %bool.counter = phi i1 [ %pre.cond, %entry ], [ %neg.bool.counter, %loop ]
+  %neg.bool.counter = xor i1 %bool.counter, true
+  %f.counter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %f.counter, %val
+  %counter.plus.1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %select = select i1 %neg.bool.counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+; void xor(int num_elts, int* a, int* addr) {
+;   for (int i = 0; i < num_elts; ++i) {
+;     if (a[i] == 0)
+;       return;
+;   }
+;   addr[0] = 5;
+;   return;
+; }
+
+define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_xor_used_outside_loop_larger_loop_body:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_6
+; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_branch .LBB3_3
+; GFX10-NEXT:  .LBB3_2: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT:    s_xor_b32 s7, s7, 1
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT:    s_or_b32 s5, s6, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execz .LBB3_5
+; GFX10-NEXT:  .LBB3_3: ; %loop.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
+; GFX10-NEXT:    global_load_dword v6, v[6:7], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_2
+; GFX10-NEXT:  ; %bb.4: ; %loop.cond
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v5
+; GFX10-NEXT:    v_cmp_lt_i32_e64 s6, v5, v0
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, v6
+; GFX10-NEXT:    s_branch .LBB3_2
+; GFX10-NEXT:  .LBB3_5: ; %loop.exit.guard
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_and_b32 s5, 1, s7
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s5
+; GFX10-NEXT:  .LBB3_6: ; %Flow1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
+; GFX10-NEXT:    s_cbranch_execz .LBB3_8
+; GFX10-NEXT:  ; %bb.7: ; %block.after.loop
+; GFX10-NEXT:    v_mov_b32_e32 v0, 5
+; GFX10-NEXT:    flat_store_dword v[3:4], v0
+; GFX10-NEXT:  .LBB3_8: ; %exit
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %start.cond = icmp eq i32 %num.elts, 0
+  br i1 %start.cond, label %loop.start, label %block.after.loop
+
+loop.start:
+  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+  %elt.i = load i32, ptr addrspace(1) %a.plus.i
+  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+  br i1 %elt.i.eq.zero, label %exit, label %loop.cond
+
+loop.cond:
+  %cond = icmp slt i32 %i, %num.elts
+  %i.plus.1 = add i32 %i, 1
+  br i1 %cond, label %block.after.loop, label %loop.start
+
+block.after.loop:
+  store i32 5, ptr %addr
+  br label %exit
+
+exit:
+  ret void
+}
+
+
+;void icmp(int num_elts, int* a, int* addr) {
+;  for(int i=0;;++i) {
+;    if(a[i]==0)
+;      return;
+;  }
+;  addr[0] = 5;
+;  return;
+;}
+
+define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr %addr) {
+; GFX10-LABEL: divergent_i1_icmp_used_outside_loop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    s_branch .LBB4_2
+; GFX10-NEXT:  .LBB4_1: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
+; GFX10-NEXT:    s_and_b32 s4, 1, s6
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    s_or_b32 s5, s4, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execz .LBB4_6
+; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_mov_b32_e32 v4, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
+; GFX10-NEXT:    s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_4
+; GFX10-NEXT:  ; %bb.3: ; %if.block.0
+; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT:    v_lshlrev_b64 v[8:9], 2, v[4:5]
+; GFX10-NEXT:    v_add_co_u32 v8, s4, v2, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v9, s4, v3, v9, s4
+; GFX10-NEXT:    global_store_dword v[8:9], v4, off
+; GFX10-NEXT:  .LBB4_4: ; %loop.break.block
+; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
+; GFX10-NEXT:    s_mov_b32 s6, 1
+; GFX10-NEXT:    ; implicit-def: $vgpr5
+; GFX10-NEXT:    s_and_saveexec_b32 s7, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB4_1
+; GFX10-NEXT:  ; %bb.5: ; %loop.cond
+; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v4
+; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    s_branch .LBB4_1
+; GFX10-NEXT:  .LBB4_6: ; %cond.block.1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_8
+; GFX10-NEXT:  ; %bb.7: ; %if.block.1
+; GFX10-NEXT:    global_store_dword v[6:7], v4, off
+; GFX10-NEXT:  .LBB4_8: ; %exit
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop.start
+
+loop.start:
+  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+  br label %cond.block.0
+
+cond.block.0:
+  %cond.0 = icmp eq i32 %v0, %i
+  br i1 %cond.0, label %if.block.0, label %loop.break.block
+
+if.block.0:
+  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+  store i32 %i, ptr addrspace(1) %a.plus.i
+  br label %loop.break.block
+
+loop.break.block:
+  %cond.1 = icmp eq i32 %v1, %i
+  br i1 %cond.1, label %cond.block.1, label %loop.cond
+
+loop.cond:
+  ; no cond, infinite loop with one break
+  %i.plus.1 = add i32 %i, 1
+  br label %loop.start
+
+cond.block.1:
+  %cond.2 = icmp eq i32 %v0, %i
+  br i1 %cond.2, label %if.block.1, label %exit
+
+if.block.1:
+  store i32 %i, ptr addrspace(1) %c
+  br label %exit
+
+exit:
+  ret void
+}
+
+
+; bool all_eq_zero = true;
+; i32 i = 0;
+; do {
+;   if(all_eq_zero)
+;     all_eq_zero = (a[i] == 0);
+;
+;   i += 1;
+; } while (i < n);
+
+; *addr = all_eq_zero ? 1.0 : 0.0;
+
+; Check that all elements in an array of size n are zero. The loop has a
+; divergent exit condition based on the array size, but the zero check does not
+; break out of the loop; instead it skips the zero check in the remaining
+; iterations. llpc "freezes" the zero check since it is (via a phi) used in a
+; conditional branch.
+define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspace(1) %a, ptr %addr) {
+; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, 1
+; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    s_branch .LBB5_2
+; GFX10-NEXT:  .LBB5_1: ; %loop.cond
+; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v5
+; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s1
+; GFX10-NEXT:    v_mov_b32_e32 v5, v7
+; GFX10-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB5_4
+; GFX10-NEXT:  .LBB5_2: ; %loop.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v6
+; GFX10-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB5_1
+; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
+; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
+; GFX10-NEXT:    global_load_dword v6, v[6:7], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v6
+; GFX10-NEXT:    s_branch .LBB5_1
+; GFX10-NEXT:  .LBB5_4: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX10-NEXT:    flat_store_dword v[3:4], v0
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %loop.start
+
+loop.start:
+  %i = phi i32 [ 0, %entry ], [ %i.plus.1, %loop.cond ]
+  %all.eq.zero = phi i1 [ true, %entry ], [ %eq.zero.fr, %loop.cond ]
+  br i1 %all.eq.zero, label %is.eq.zero, label %loop.cond
+
+is.eq.zero:
+  %a.plus.i = getelementptr i32, ptr addrspace(1) %a, i32 %i
+  %elt.i = load i32, ptr addrspace(1) %a.plus.i
+  %elt.i.eq.zero = icmp eq i32 %elt.i, 0
+  br label %loop.cond
+
+loop.cond:
+  %eq.zero = phi i1 [ %all.eq.zero, %loop.start ], [ %elt.i.eq.zero, %is.eq.zero ]
+  %eq.zero.fr = freeze i1 %eq.zero
+  %cond = icmp slt i32 %i, %n
+  %i.plus.1 = add i32 %i, 1
+  br i1 %cond, label %exit, label %loop.start
+
+exit:
+  %select = select i1 %eq.zero.fr, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+; Divergent i1 phi from structurize-cfg used outside of the loop
+define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 addrspace(1)* %a, i32 addrspace(1)* %a.break) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s0
+; GFX10-NEXT:    s_branch .LBB6_2
+; GFX10-NEXT:  .LBB6_1: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_and_b32 s1, 1, s3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB6_4
+; GFX10-NEXT:  .LBB6_2: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[9:10], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB6_1
+; GFX10-NEXT:  ; %bb.3: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    global_load_dword v9, v[7:8], off
+; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT:    global_store_dword v[7:8], v9, off
+; GFX10-NEXT:    s_branch .LBB6_1
+; GFX10-NEXT:  .LBB6_4: ; %loop.exit.guard
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
+; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB6_6
+; GFX10-NEXT:  ; %bb.5: ; %break.body
+; GFX10-NEXT:    v_mov_b32_e32 v0, 10
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:  .LBB6_6: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+  store i32 10, i32 addrspace(1)* %a.break
+  br label %exit
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, 100
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
new file mode 100644
index 0000000000000..671ccbdc2290c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -0,0 +1,914 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: divergent_i1_phi_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_used_outside_loop
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+    %5:_(s32) = G_CONSTANT i32 0
+    %6:_(s32) = G_FCONSTANT float 1.000000e+00
+    %7:_(s1) = G_FCMP floatpred(ogt), %1(s32), %6
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %8:_(s32) = G_PHI %9(s32), %bb.1, %5(s32), %bb.0
+    %10:_(s32) = G_PHI %5(s32), %bb.0, %11(s32), %bb.1
+    %12:_(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.1
+    %14:_(s1) = G_CONSTANT i1 true
+    %13:_(s1) = G_XOR %12, %14
+    %15:_(s32) = G_UITOFP %10(s32)
+    %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
+    %17:_(s32) = G_CONSTANT i32 1
+    %11:_(s32) = G_ADD %10, %17
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+    SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %18:_(s1) = G_PHI %12(s1), %bb.1
+    %19:_(s32) = G_PHI %9(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+    %20:_(s32) = G_FCONSTANT float 0.000000e+00
+    %21:_(s32) = G_FCONSTANT float 1.000000e+00
+    %22:_(s32) = G_SELECT %18(s1), %21, %20
+    G_STORE %22(s32), %4(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: divergent_i1_phi_used_outside_loop_larger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_used_outside_loop_larger_loop_body
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %13(s1), %bb.3
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI1]], [[C3]](s64)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]]
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[ADD]](s32), [[C5]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.1
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.3
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C7]], [[C6]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    %0:_(s32) = COPY $vgpr1
+    %1:_(s32) = COPY $vgpr2
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr3
+    %4:_(s32) = COPY $vgpr4
+    %5:_(p0) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = G_CONSTANT i32 -1
+    %7:_(s1) = G_CONSTANT i1 true
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %8:_(s32) = G_PHI %6(s32), %bb.0, %9(s32), %bb.3
+    %10:_(p1) = G_PHI %2(p1), %bb.0, %11(p1), %bb.3
+    %12:sreg_32_xm0_xexec(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.3
+    %14:sreg_32_xm0_xexec(s32) = SI_IF %12(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %15:_(s32) = G_LOAD %10(p1) :: (load (s32), addrspace 1)
+    %16:_(s32) = G_CONSTANT i32 0
+    %17:_(s1) = G_ICMP intpred(eq), %15(s32), %16
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %13:_(s1) = G_PHI %17(s1), %bb.2, %12(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+    %18:_(s64) = G_CONSTANT i64 4
+    %11:_(p1) = G_PTR_ADD %10, %18(s64)
+    %19:_(s32) = G_CONSTANT i32 1
+    %9:_(s32) = nsw G_ADD %8, %19
+    %20:_(s32) = G_CONSTANT i32 10
+    %21:_(s1) = G_ICMP intpred(sge), %9(s32), %20
+    G_BRCOND %21(s1), %bb.1
+    G_BR %bb.4
+
+  bb.4:
+    %22:_(s1) = G_PHI %12(s1), %bb.3
+    %23:_(s32) = G_FCONSTANT float 0.000000e+00
+    %24:_(s32) = G_FCONSTANT float 1.000000e+00
+    %25:_(s32) = G_SELECT %22(s1), %24, %23
+    G_STORE %25(s32), %5(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: divergent_i1_xor_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_xor_used_outside_loop
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p0) = G_MERGE_VALUES %2(s32), %3(s32)
+    %5:_(s32) = G_CONSTANT i32 0
+    %6:_(s32) = G_FCONSTANT float 1.000000e+00
+    %7:_(s1) = G_FCMP floatpred(ogt), %1(s32), %6
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %8:_(s32) = G_PHI %9(s32), %bb.1, %5(s32), %bb.0
+    %10:_(s32) = G_PHI %5(s32), %bb.0, %11(s32), %bb.1
+    %12:_(s1) = G_PHI %7(s1), %bb.0, %13(s1), %bb.1
+    %14:_(s1) = G_CONSTANT i1 true
+    %13:_(s1) = G_XOR %12, %14
+    %15:_(s32) = G_UITOFP %10(s32)
+    %16:_(s1) = G_FCMP floatpred(ogt), %15(s32), %0
+    %17:_(s32) = G_CONSTANT i32 1
+    %11:_(s32) = G_ADD %10, %17
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %16(s1), %8(s32)
+    SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %18:_(s1) = G_PHI %13(s1), %bb.1
+    %19:_(s32) = G_PHI %9(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %19(s32)
+    %20:_(s32) = G_FCONSTANT float 0.000000e+00
+    %21:_(s32) = G_FCONSTANT float 1.000000e+00
+    %22:_(s32) = G_SELECT %18(s1), %21, %20
+    G_STORE %22(s32), %4(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: divergent_i1_xor_used_outside_loop_larger_loop_body
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_xor_used_outside_loop_larger_loop_body
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, [[C1]](s1), %bb.0
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.7(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C7]]
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI2]](s32), [[COPY]]
+  ; GFX10-NEXT:   G_BR %bb.7
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+  ; GFX10-NEXT:   G_STORE [[C8]](s32), [[MV1]](p0) :: (store (s32))
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   SI_RETURN
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.7:
+  ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.3(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[C6]](s1), %bb.4, [[C3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C3]](s1), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C9]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI5]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.8
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.8:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.7
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   G_BR %bb.2
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s32) = COPY $vgpr3
+    %5:_(s32) = COPY $vgpr4
+    %6:_(p0) = G_MERGE_VALUES %4(s32), %5(s32)
+    %7:_(s32) = G_IMPLICIT_DEF
+    %8:_(s32) = G_CONSTANT i32 0
+    %9:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %8
+    %10:_(s1) = G_CONSTANT i1 true
+    %11:sreg_32_xm0_xexec(s32) = SI_IF %9(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %12:_(s32) = G_CONSTANT i32 0
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %13:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, %10(s1), %bb.0
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.3:
+    successors: %bb.4(0x40000000), %bb.7(0x40000000)
+
+    %16:_(s32) = G_PHI %12(s32), %bb.1, %17(s32), %bb.7
+    %18:_(s32) = G_PHI %19(s32), %bb.7, %12(s32), %bb.1
+    %20:_(s1) = G_CONSTANT i1 true
+    %21:_(s64) = G_SEXT %18(s32)
+    %22:_(s32) = G_CONSTANT i32 2
+    %23:_(s64) = G_SHL %21, %22(s32)
+    %24:_(p1) = G_PTR_ADD %3, %23(s64)
+    %25:_(s32) = G_LOAD %24(p1) :: (load (s32), addrspace 1)
+    %26:_(s32) = G_CONSTANT i32 0
+    %27:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %25(s32), %26
+    %28:sreg_32_xm0_xexec(s32) = SI_IF %27(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.4:
+    successors: %bb.7(0x80000000)
+
+    %29:_(s1) = G_CONSTANT i1 false
+    %30:_(s32) = G_CONSTANT i32 1
+    %31:_(s32) = G_ADD %18, %30
+    %32:_(s1) = G_ICMP intpred(slt), %18(s32), %0
+    G_BR %bb.7
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+    %33:_(s32) = G_CONSTANT i32 5
+    G_STORE %33(s32), %6(p0) :: (store (s32))
+
+  bb.6:
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    SI_RETURN
+
+  bb.7:
+    successors: %bb.8(0x04000000), %bb.3(0x7c000000)
+
+    %19:_(s32) = G_PHI %31(s32), %bb.4, %7(s32), %bb.3
+    %34:_(s1) = G_PHI %29(s1), %bb.4, %20(s1), %bb.3
+    %35:_(s1) = G_PHI %32(s1), %bb.4, %20(s1), %bb.3
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %28(s32)
+    %36:_(s1) = G_CONSTANT i1 true
+    %37:_(s1) = G_XOR %34, %36
+    %17:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %35(s1), %16(s32)
+    SI_LOOP %17(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.8
+
+  bb.8:
+    successors: %bb.2(0x80000000)
+
+    %14:_(s1) = G_PHI %37(s1), %bb.7
+    %38:_(s32) = G_PHI %17(s32), %bb.7
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+    G_BR %bb.2
+...
+
+---
+name: divergent_i1_icmp_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_icmp_used_outside_loop
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr6
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr7
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+  ; GFX10-NEXT:   G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C3]](s1), %bb.5, [[C2]](s1), %bb.4
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.7
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.7:
+  ; GFX10-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.6
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.6
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.8
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.8:
+  ; GFX10-NEXT:   successors: %bb.9(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   G_STORE [[PHI6]](s32), [[MV1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.9:
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(p1) = G_MERGE_VALUES %2(s32), %3(s32)
+    %5:_(s32) = COPY $vgpr6
+    %6:_(s32) = COPY $vgpr7
+    %7:_(p1) = G_MERGE_VALUES %5(s32), %6(s32)
+    %8:_(s32) = G_CONSTANT i32 0
+    %9:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %10:_(s32) = G_PHI %11(s32), %bb.6, %8(s32), %bb.0
+    %12:_(s32) = G_PHI %8(s32), %bb.0, %13(s32), %bb.6
+
+  bb.2:
+    successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+    %14:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %0(s32), %12
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %14(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.3:
+    successors: %bb.4(0x80000000)
+
+    %16:_(s64) = G_SEXT %12(s32)
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s64) = G_SHL %16, %17(s32)
+    %19:_(p1) = G_PTR_ADD %4, %18(s64)
+    G_STORE %12(s32), %19(p1) :: (store (s32), addrspace 1)
+
+  bb.4:
+    successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+    %20:_(s1) = G_CONSTANT i1 true
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    %21:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %1(s32), %12
+    %22:sreg_32_xm0_xexec(s32) = SI_IF %21(s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.5:
+    successors: %bb.6(0x80000000)
+
+    %23:_(s1) = G_CONSTANT i1 false
+    %24:_(s32) = G_CONSTANT i32 1
+    %25:_(s32) = G_ADD %12, %24
+
+  bb.6:
+    successors: %bb.7(0x04000000), %bb.1(0x7c000000)
+
+    %13:_(s32) = G_PHI %25(s32), %bb.5, %9(s32), %bb.4
+    %26:_(s1) = G_PHI %23(s1), %bb.5, %20(s1), %bb.4
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %22(s32)
+    %11:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %26(s1), %10(s32)
+    SI_LOOP %11(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.7
+
+  bb.7:
+    successors: %bb.8(0x40000000), %bb.9(0x40000000)
+
+    %27:_(s32) = G_PHI %11(s32), %bb.6
+    %28:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.6
+    %29:_(s32) = G_PHI %12(s32), %bb.6
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    %30:sreg_32_xm0_xexec(s32) = SI_IF %28(s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.8
+
+  bb.8:
+    successors: %bb.9(0x80000000)
+
+    G_STORE %29(s32), %7(p1) :: (store (s32), addrspace 1)
+
+  bb.9:
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+    SI_RETURN
+...
+
+---
+name: divergent_i1_freeze_used_outside_loop
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_freeze_used_outside_loop
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %14(s1), %bb.3
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[PHI3]]
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI1]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[FREEZE]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C6]], [[C5]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p1) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s32) = COPY $vgpr3
+    %5:_(s32) = COPY $vgpr4
+    %6:_(p0) = G_MERGE_VALUES %4(s32), %5(s32)
+    %7:_(s32) = G_CONSTANT i32 0
+    %8:_(s1) = G_CONSTANT i1 true
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %9:_(s32) = G_PHI %10(s32), %bb.3, %7(s32), %bb.0
+    %11:_(s32) = G_PHI %7(s32), %bb.0, %12(s32), %bb.3
+    %13:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %14(s1), %bb.3
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %13(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %16:_(s64) = G_SEXT %11(s32)
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s64) = G_SHL %16, %17(s32)
+    %19:_(p1) = G_PTR_ADD %3, %18(s64)
+    %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+    %21:_(s32) = G_CONSTANT i32 0
+    %22:_(s1) = G_ICMP intpred(eq), %20(s32), %21
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %23:_(s1) = G_PHI %22(s1), %bb.2, %13(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    %14:_(s1) = G_FREEZE %23
+    %24:_(s32) = G_CONSTANT i32 1
+    %12:_(s32) = G_ADD %11, %24
+    %25:_(s1) = G_ICMP intpred(slt), %11(s32), %0
+    %10:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %25(s1), %9(s32)
+    SI_LOOP %10(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.4:
+    %26:_(s1) = G_PHI %14(s1), %bb.3
+    %27:_(s32) = G_PHI %10(s32), %bb.3
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    %28:_(s32) = G_FCONSTANT float 0.000000e+00
+    %29:_(s32) = G_FCONSTANT float 1.000000e+00
+    %30:_(s32) = G_SELECT %26(s1), %29, %28
+    G_STORE %30(s32), %6(p0) :: (store (s32))
+    S_ENDPGM 0
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_1break
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+  ; GFX10-NEXT:   G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = COPY $vgpr4
+    %7:_(s32) = COPY $vgpr5
+    %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+    %9:_(s32) = G_CONSTANT i32 0
+    %10:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+    %11:_(s32) = G_PHI %12(s32), %bb.5, %9(s32), %bb.0
+    %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.5
+    %15:_(s1) = G_CONSTANT i1 true
+    %16:_(s64) = G_SEXT %13(s32)
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s64) = G_SHL %16, %17(s32)
+    %19:_(p1) = G_PTR_ADD %5, %18(s64)
+    %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+    %21:_(s32) = G_CONSTANT i32 0
+    %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+
+    %24:_(s32) = G_CONSTANT i32 10
+    G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1)
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.5(0x80000000)
+
+    %25:_(s1) = G_CONSTANT i1 false
+    %26:_(s32) = G_CONSTANT i32 2
+    %27:_(s64) = G_SHL %16, %26(s32)
+    %28:_(p1) = G_PTR_ADD %2, %27(s64)
+    %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+    %30:_(s32) = G_CONSTANT i32 1
+    %31:_(s32) = G_ADD %29, %30
+    G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+    %32:_(s32) = G_ADD %13, %30
+    %33:_(s32) = G_CONSTANT i32 100
+    %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33
+    G_BR %bb.5
+
+  bb.4:
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+    S_ENDPGM 0
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+    %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
+    %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
+    %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+    SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.6:
+    successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+    %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
+    %39:_(s32) = G_PHI %12(s32), %bb.5
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
new file mode 100644
index 0000000000000..6f60adf2f7ab6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -0,0 +1,461 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+; Simplest case, if - then, that requires lane mask merging:
+; the %phi lane mask will hold %val_A at %A. Lanes that are active in %B
+; will overwrite their own lane bit in the lane mask with %val_B.
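+;
+; A rough sketch of that merge (wave32 assumed; EXEC here stands for the set of
+; lanes that executed %B, and the .mask names are illustrative only):
+;   %phi.mask = (%val_A.mask & ~EXEC) | (%val_B.mask & EXEC)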
+define amdgpu_ps void @divergent_i1_phi_if_then(i32 addrspace(1)* %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: divergent_i1_phi_if_then:
+; GFX10:       ; %bb.0: ; %A
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:  ; %bb.1: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+A:
+  %val_A = icmp uge i32 %tid, 6
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %B, label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 1
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %sel = select i1 %phi, i32 1, i32 2
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; if - else
+define amdgpu_ps void @divergent_i1_phi_if_else(i32 addrspace(1)* %out, i32 %tid, i32 %cond) {
+; GFX10-LABEL: divergent_i1_phi_if_else:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_and_b32 s0, 1, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:  ; %bb.1: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 2, v2
+; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:  ; %bb.2: ; %Flow
+; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
+; GFX10-NEXT:  ; %bb.3: ; %A
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 1, v2
+; GFX10-NEXT:  ; %bb.4: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    global_store_dword v[0:1], v2, off
+; GFX10-NEXT:    s_endpgm
+entry:
+  %cmp = icmp eq i32 %cond, 0
+  br i1 %cmp, label %A, label %B
+
+A:
+  %val_A = icmp uge i32 %tid, 1
+  br label %exit
+
+B:
+  %val_B = icmp ult i32 %tid, 2
+  br label %exit
+
+exit:
+  %phi = phi i1 [ %val_A, %A ], [ %val_B, %B ]
+  %sel = select i1 %phi, i32 1, i32 2
+  store i32 %sel, i32 addrspace(1)* %out
+  ret void
+}
+
+; if - break;
+
+;  counter = 0;
+;  do {
+;    if (a[counter] == 0)
+;      break;
+;    if (b[counter] == 0)
+;      break;
+;    if (c[counter] == 0)
+;      break;
+;    x[counter++]+=1;
+;  } while (counter<100);
+
+; Tests with multiple break conditions. Divergent phis are used to track
+; whether any of the break conditions was reached. We only need to do simple
+; lane mask merging (for the current loop iteration only). There is an
+; intrinsic, if.break, that merges lane masks across all iterations of the loop.
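+; For reference, a rough per-iteration merge sketch in the wave32 idiom (names
+; are illustrative and not part of these tests): keep the bits of inactive
+; lanes and overwrite the bits of lanes active in the current block:
+;   %inactive = S_ANDN2_B32 %prev_mask, $exec_lo   ; bits of inactive lanes
+;   %active   = S_AND_B32   %cur_val,   $exec_lo   ; bits of active lanes
+;   %merged   = S_OR_B32    %inactive,  %active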
+
+define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 addrspace(1)* %a) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s0
+; GFX10-NEXT:    s_branch .LBB2_2
+; GFX10-NEXT:  .LBB2_1: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB2_4
+; GFX10-NEXT:  .LBB2_2: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 2, v[4:5]
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v2, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
+; GFX10-NEXT:    global_load_dword v7, v[7:8], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB2_1
+; GFX10-NEXT:  ; %bb.3: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v0, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v4
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v4
+; GFX10-NEXT:    global_load_dword v7, v[5:6], off
+; GFX10-NEXT:    v_mov_b32_e32 v4, v8
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v7
+; GFX10-NEXT:    global_store_dword v[5:6], v7, off
+; GFX10-NEXT:    s_branch .LBB2_1
+; GFX10-NEXT:  .LBB2_4: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %exit, label %loop.body
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, 100
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
+define amdgpu_cs void @loop_with_2breaks(i32 addrspace(1)* %x, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+; GFX10-LABEL: loop_with_2breaks:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s0
+; GFX10-NEXT:    s_branch .LBB3_3
+; GFX10-NEXT:  .LBB3_1: ; %Flow3
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB3_2: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB3_6
+; GFX10-NEXT:  .LBB3_3: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[9:10], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_2
+; GFX10-NEXT:  ; %bb.4: ; %B
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[9:10], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB3_1
+; GFX10-NEXT:  ; %bb.5: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT:    global_load_dword v9, v[7:8], off
+; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT:    global_store_dword v[7:8], v9, off
+; GFX10-NEXT:    s_branch .LBB3_1
+; GFX10-NEXT:  .LBB3_6: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %exit, label %B
+
+B:
+  %b.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %b, i32 %counter
+  %b.val = load i32, i32 addrspace(1)* %b.plus.counter
+  %b.cond = icmp eq i32 %b.val, 0
+  br i1 %b.cond, label %exit, label %loop.body
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, 100
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
+define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)* %a, i32 addrspace(1)* %b, i32 addrspace(1)* %c) {
+; GFX10-LABEL: loop_with_3breaks:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v8, s0
+; GFX10-NEXT:    s_branch .LBB4_4
+; GFX10-NEXT:  .LBB4_1: ; %Flow5
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:  .LBB4_2: ; %Flow4
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:  .LBB4_3: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB4_8
+; GFX10-NEXT:  .LBB4_4: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    v_lshlrev_b64 v[9:10], 2, v[8:9]
+; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v2, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
+; GFX10-NEXT:    global_load_dword v11, v[11:12], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_3
+; GFX10-NEXT:  ; %bb.5: ; %B
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
+; GFX10-NEXT:    global_load_dword v11, v[11:12], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_2
+; GFX10-NEXT:  ; %bb.6: ; %C
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT:    global_load_dword v11, v[11:12], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB4_1
+; GFX10-NEXT:  ; %bb.7: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v8
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v8
+; GFX10-NEXT:    global_load_dword v11, v[9:10], off
+; GFX10-NEXT:    v_mov_b32_e32 v8, v12
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v11
+; GFX10-NEXT:    global_store_dword v[9:10], v11, off
+; GFX10-NEXT:    s_branch .LBB4_1
+; GFX10-NEXT:  .LBB4_8: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %exit, label %B
+
+B:
+  %b.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %b, i32 %counter
+  %b.val = load i32, i32 addrspace(1)* %b.plus.counter
+  %b.cond = icmp eq i32 %b.val, 0
+  br i1 %b.cond, label %exit, label %C
+
+C:
+  %c.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %c, i32 %counter
+  %c.val = load i32, i32 addrspace(1)* %c.plus.counter
+  %c.cond = icmp eq i32 %c.val, 0
+  br i1 %c.cond, label %exit, label %loop.body
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, 100
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
+; If with a divergent condition and a body that ends in a break. This is a loop
+; with two exits, but the structurizer will create a phi that tracks whether the
+; break exit was taken and will move break.body after the loop. The loop then
+; has one exit, and the phi is used outside of the loop as the condition for
+; entering break.body.
+define amdgpu_cs void @loop_with_div_break_with_body(i32 addrspace(1)* %x, i32 addrspace(1)* %a, i32 addrspace(1)* %a.break) {
+; GFX10-LABEL: loop_with_div_break_with_body:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    v_mov_b32_e32 v6, s0
+; GFX10-NEXT:    s_branch .LBB5_2
+; GFX10-NEXT:  .LBB5_1: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_and_b32 s1, 1, s3
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB5_4
+; GFX10-NEXT:  .LBB5_2: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
+; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
+; GFX10-NEXT:    global_load_dword v9, v[9:10], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
+; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_cbranch_execz .LBB5_1
+; GFX10-NEXT:  ; %bb.3: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
+; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    global_load_dword v9, v[7:8], off
+; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
+; GFX10-NEXT:    global_store_dword v[7:8], v9, off
+; GFX10-NEXT:    s_branch .LBB5_1
+; GFX10-NEXT:  .LBB5_4: ; %loop.exit.guard
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX10-NEXT:    s_and_saveexec_b32 s0, s1
+; GFX10-NEXT:    s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT:    s_cbranch_execz .LBB5_6
+; GFX10-NEXT:  ; %bb.5: ; %break.body
+; GFX10-NEXT:    v_mov_b32_e32 v0, 10
+; GFX10-NEXT:    global_store_dword v[4:5], v0, off
+; GFX10-NEXT:  .LBB5_6: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+  store i32 10, i32 addrspace(1)* %a.break
+  br label %exit
+
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, 100
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
+; Snippet from a test generated by the GraphicsFuzz tool; the frontend generates
+; IR with an irreducible control flow graph. FixIrreducible converts it into a
+; natural loop and in the process creates an i1 phi with three incoming values.
+
+; int loop(int x, int y, int a0, int a1, int a2, int a3, int a4) {
+;   do {
+;     if (y < a2) {
+;       do {
+;       } while (x < a2);
+;     }
+;     if (x < a3) {
+;       return a1;
+;     }
+;   } while (y < a2);
+;   return a0;
+; }
+
+; This test is also interesting because it has a phi with three incoming values.
+;define amdgpu_ps i32 @irreducible_cfg(i32 %x, i32 %y, i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+;.entry:
+; %.y_lt_a2 = icmp sgt i32 %a2, %y
+; %.x_lt_a2 = icmp sgt i32 %a2, %x
+; %.x_lt_a3 = icmp sgt i32 %a3, %x
+; br i1 %.y_lt_a2, label %.preheader, label %.loopexit ; first iteration, jump to inner loop if 'y < a2' or start with 'if (x < a3)'
+;
+;.preheader: ; if (y < a2),
+; br label %.inner_loop
+;
+;.inner_loop: ; do while x < a2
+; br i1 %.x_lt_a2, label %.inner_loop, label %.loopexit
+;
+;.loopexit: ; if x < a3
+; %not.inner_loop = xor i1 %.y_lt_a2, true
+; %brmerge = select i1 %.x_lt_a3, i1 true, i1 %not.inner_loop ; exit loop if 'x < a3' or 'loop ends since !(y < a2)'
+; %.ret = select i1 %.x_lt_a3, i32 %a1, i32 %a0               ; select return value a1 if 'x < a3' or a0 if 'loop ends'
+; br i1 %brmerge, label %.exit, label %.preheader
+;
+;.exit:
+; ret i32 %.ret
+;}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
new file mode 100644
index 0000000000000..5f956c0f1e2e3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -0,0 +1,1004 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: divergent_i1_phi_if_then
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_if_then
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(s32) = G_CONSTANT i32 6
+    %6:_(s1) = G_ICMP intpred(uge), %3(s32), %5
+    %7:_(s32) = G_CONSTANT i32 0
+    %8:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), %4(s32), %7
+    %9:sreg_32_xm0_xexec(s32) = SI_IF %8(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.1
+
+  bb.1:
+    successors: %bb.2(0x80000000)
+
+    %10:_(s32) = G_CONSTANT i32 1
+    %11:_(s1) = G_ICMP intpred(ult), %3(s32), %10
+
+  bb.2:
+    %12:_(s1) = G_PHI %6(s1), %bb.0, %11(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %9(s32)
+    %13:_(s32) = G_CONSTANT i32 2
+    %14:_(s32) = G_CONSTANT i32 1
+    %15:_(s32) = G_SELECT %12(s1), %14, %13
+    G_STORE %15(s32), %2(p1) :: (store (s32), addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: divergent_i1_phi_if_else
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: divergent_i1_phi_if_else
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.1(0x40000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %10(s1), %bb.3, [[DEF]](s1), %bb.0
+  ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT:   G_BR %bb.1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[PHI]](s1), %bb.1, [[ICMP1]](s1), %bb.2
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI1]](s1), [[C3]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.3(0x40000000), %bb.1(0x40000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(s1) = G_IMPLICIT_DEF
+    %6:_(s32) = G_CONSTANT i32 0
+    %7:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %4(s32), %6
+    %8:sreg_32_xm0_xexec(s32) = SI_IF %7(s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+    %9:_(s1) = G_PHI %10(s1), %bb.3, %5(s1), %bb.0
+    %11:sreg_32_xm0_xexec(s32) = SI_ELSE %8(s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+
+    %12:_(s32) = G_CONSTANT i32 1
+    %13:_(s1) = G_ICMP intpred(uge), %3(s32), %12
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.1(0x80000000)
+
+    %14:_(s32) = G_CONSTANT i32 2
+    %10:_(s1) = G_ICMP intpred(ult), %3(s32), %14
+    G_BR %bb.1
+
+  bb.4:
+    %15:_(s1) = G_PHI %9(s1), %bb.1, %13(s1), %bb.2
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %11(s32)
+    %16:_(s32) = G_CONSTANT i32 1
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s32) = G_SELECT %15(s1), %16, %17
+    G_STORE %18(s32), %2(p1) :: (store (s32), addrspace 1)
+    S_ENDPGM 0
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_1break
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C5]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C5]]
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C6]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.2, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = G_CONSTANT i32 0
+    %7:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %8:_(s32) = G_PHI %9(s32), %bb.3, %6(s32), %bb.0
+    %10:_(s32) = G_PHI %6(s32), %bb.0, %11(s32), %bb.3
+    %12:_(s1) = G_CONSTANT i1 true
+    %13:_(s64) = G_SEXT %10(s32)
+    %14:_(s32) = G_CONSTANT i32 2
+    %15:_(s64) = G_SHL %13, %14(s32)
+    %16:_(p1) = G_PTR_ADD %5, %15(s64)
+    %17:_(s32) = G_LOAD %16(p1) :: (load (s32), addrspace 1)
+    %18:_(s32) = G_CONSTANT i32 0
+    %19:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %17(s32), %18
+    %20:sreg_32_xm0_xexec(s32) = SI_IF %19(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.3(0x80000000)
+
+    %21:_(s32) = G_CONSTANT i32 2
+    %22:_(s64) = G_SHL %13, %21(s32)
+    %23:_(p1) = G_PTR_ADD %2, %22(s64)
+    %24:_(s32) = G_LOAD %23(p1) :: (load (s32), addrspace 1)
+    %25:_(s32) = G_CONSTANT i32 1
+    %26:_(s32) = G_ADD %24, %25
+    G_STORE %26(s32), %23(p1) :: (store (s32), addrspace 1)
+    %27:_(s32) = G_ADD %10, %25
+    %28:_(s32) = G_CONSTANT i32 100
+    %29:_(s1) = G_ICMP intpred(ult), %10(s32), %28
+
+  bb.3:
+    successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+    %11:_(s32) = G_PHI %27(s32), %bb.2, %7(s32), %bb.1
+    %30:_(s1) = G_PHI %29(s1), %bb.2, %12(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %20(s32)
+    %9:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %30(s1), %8(s32)
+    SI_LOOP %9(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.4:
+    %31:_(s32) = G_PHI %9(s32), %bb.3
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+    S_ENDPGM 0
+...
+
+---
+name: loop_with_2breaks
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_2breaks
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %34(s1), %bb.5, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C7]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL2]](s64)
+  ; GFX10-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+  ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C9]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C4]](s1), %bb.2
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = COPY $vgpr4
+    %7:_(s32) = COPY $vgpr5
+    %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+    %9:_(s32) = G_CONSTANT i32 0
+    %10:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %11:_(s32) = G_PHI %12(s32), %bb.3, %9(s32), %bb.0
+    %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.3
+    %15:_(s1) = G_CONSTANT i1 true
+    %16:_(s64) = G_SEXT %13(s32)
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s64) = G_SHL %16, %17(s32)
+    %19:_(p1) = G_PTR_ADD %5, %18(s64)
+    %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+    %21:_(s32) = G_CONSTANT i32 0
+    %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+    %24:_(s1) = G_CONSTANT i1 true
+    %25:_(s32) = G_CONSTANT i32 2
+    %26:_(s64) = G_SHL %16, %25(s32)
+    %27:_(p1) = G_PTR_ADD %8, %26(s64)
+    %28:_(s32) = G_LOAD %27(p1) :: (load (s32), addrspace 1)
+    %29:_(s32) = G_CONSTANT i32 0
+    %30:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %28(s32), %29
+    %31:sreg_32_xm0_xexec(s32) = SI_IF %30(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+    %14:_(s32) = G_PHI %32(s32), %bb.5, %10(s32), %bb.1
+    %33:_(s1) = G_PHI %34(s1), %bb.5, %15(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %33(s1), %11(s32)
+    SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.4:
+    successors: %bb.5(0x80000000)
+
+    %35:_(s32) = G_CONSTANT i32 2
+    %36:_(s64) = G_SHL %16, %35(s32)
+    %37:_(p1) = G_PTR_ADD %2, %36(s64)
+    %38:_(s32) = G_LOAD %37(p1) :: (load (s32), addrspace 1)
+    %39:_(s32) = G_CONSTANT i32 1
+    %40:_(s32) = G_ADD %38, %39
+    G_STORE %40(s32), %37(p1) :: (store (s32), addrspace 1)
+    %41:_(s32) = G_ADD %13, %39
+    %42:_(s32) = G_CONSTANT i32 100
+    %43:_(s1) = G_ICMP intpred(ult), %13(s32), %42
+
+  bb.5:
+    successors: %bb.3(0x80000000)
+
+    %32:_(s32) = G_PHI %41(s32), %bb.4, %10(s32), %bb.2
+    %34:_(s1) = G_PHI %43(s1), %bb.4, %24(s1), %bb.2
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %31(s32)
+    G_BR %bb.3
+
+  bb.6:
+    %44:_(s32) = G_PHI %12(s32), %bb.3
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %44(s32)
+    S_ENDPGM 0
+...
+
+---
+name: loop_with_3breaks
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_3breaks
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $vgpr6
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:_(s32) = COPY $vgpr7
+  ; GFX10-NEXT:   [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %37(s1), %bb.5, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.8
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.6(0x40000000), %bb.7(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
+  ; GFX10-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD2]](s32), [[C9]]
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI %47(s1), %bb.7, [[C4]](s1), %bb.2
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.7(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL3:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C10]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD3:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL3]](s64)
+  ; GFX10-NEXT:   [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD3]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[C11]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD3]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C11]]
+  ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; GFX10-NEXT:   [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C12]]
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.7:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s1) = G_PHI [[ICMP3]](s1), %bb.6, [[C7]](s1), %bb.4
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.8:
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = COPY $vgpr4
+    %7:_(s32) = COPY $vgpr5
+    %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+    %9:_(s32) = COPY $vgpr6
+    %10:_(s32) = COPY $vgpr7
+    %11:_(p1) = G_MERGE_VALUES %9(s32), %10(s32)
+    %12:_(s32) = G_CONSTANT i32 0
+    %13:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.2(0x40000000), %bb.3(0x40000000)
+
+    %14:_(s32) = G_PHI %15(s32), %bb.3, %12(s32), %bb.0
+    %16:_(s32) = G_PHI %12(s32), %bb.0, %17(s32), %bb.3
+    %18:_(s1) = G_CONSTANT i1 true
+    %19:_(s64) = G_SEXT %16(s32)
+    %20:_(s32) = G_CONSTANT i32 2
+    %21:_(s64) = G_SHL %19, %20(s32)
+    %22:_(p1) = G_PTR_ADD %5, %21(s64)
+    %23:_(s32) = G_LOAD %22(p1) :: (load (s32), addrspace 1)
+    %24:_(s32) = G_CONSTANT i32 0
+    %25:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %23(s32), %24
+    %26:sreg_32_xm0_xexec(s32) = SI_IF %25(s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    successors: %bb.4(0x40000000), %bb.5(0x40000000)
+
+    %27:_(s1) = G_CONSTANT i1 true
+    %28:_(s32) = G_CONSTANT i32 2
+    %29:_(s64) = G_SHL %19, %28(s32)
+    %30:_(p1) = G_PTR_ADD %8, %29(s64)
+    %31:_(s32) = G_LOAD %30(p1) :: (load (s32), addrspace 1)
+    %32:_(s32) = G_CONSTANT i32 0
+    %33:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %31(s32), %32
+    %34:sreg_32_xm0_xexec(s32) = SI_IF %33(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.8(0x04000000), %bb.1(0x7c000000)
+
+    %17:_(s32) = G_PHI %35(s32), %bb.5, %13(s32), %bb.1
+    %36:_(s1) = G_PHI %37(s1), %bb.5, %18(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %26(s32)
+    %15:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %14(s32)
+    SI_LOOP %15(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.8
+
+  bb.4:
+    successors: %bb.6(0x40000000), %bb.7(0x40000000)
+
+    %38:_(s1) = G_CONSTANT i1 true
+    %39:_(s32) = G_CONSTANT i32 2
+    %40:_(s64) = G_SHL %19, %39(s32)
+    %41:_(p1) = G_PTR_ADD %11, %40(s64)
+    %42:_(s32) = G_LOAD %41(p1) :: (load (s32), addrspace 1)
+    %43:_(s32) = G_CONSTANT i32 0
+    %44:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %42(s32), %43
+    %45:sreg_32_xm0_xexec(s32) = SI_IF %44(s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.5:
+    successors: %bb.3(0x80000000)
+
+    %35:_(s32) = G_PHI %46(s32), %bb.7, %13(s32), %bb.2
+    %37:_(s1) = G_PHI %47(s1), %bb.7, %27(s1), %bb.2
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    G_BR %bb.3
+
+  bb.6:
+    successors: %bb.7(0x80000000)
+
+    %48:_(s32) = G_CONSTANT i32 2
+    %49:_(s64) = G_SHL %19, %48(s32)
+    %50:_(p1) = G_PTR_ADD %2, %49(s64)
+    %51:_(s32) = G_LOAD %50(p1) :: (load (s32), addrspace 1)
+    %52:_(s32) = G_CONSTANT i32 1
+    %53:_(s32) = G_ADD %51, %52
+    G_STORE %53(s32), %50(p1) :: (store (s32), addrspace 1)
+    %54:_(s32) = G_ADD %16, %52
+    %55:_(s32) = G_CONSTANT i32 100
+    %56:_(s1) = G_ICMP intpred(ult), %16(s32), %55
+
+  bb.7:
+    successors: %bb.5(0x80000000)
+
+    %46:_(s32) = G_PHI %54(s32), %bb.6, %13(s32), %bb.4
+    %47:_(s1) = G_PHI %56(s1), %bb.6, %38(s1), %bb.4
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %45(s32)
+    G_BR %bb.5
+
+  bb.8:
+    %57:_(s32) = G_PHI %15(s32), %bb.3
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %57(s32)
+    S_ENDPGM 0
+...
+
+---
+name: loop_with_div_break_with_body
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_div_break_with_body
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+  ; GFX10-NEXT:   G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+  ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $vgpr3
+    %5:_(p1) = G_MERGE_VALUES %3(s32), %4(s32)
+    %6:_(s32) = COPY $vgpr4
+    %7:_(s32) = COPY $vgpr5
+    %8:_(p1) = G_MERGE_VALUES %6(s32), %7(s32)
+    %9:_(s32) = G_CONSTANT i32 0
+    %10:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.3(0x40000000), %bb.5(0x40000000)
+
+    %11:_(s32) = G_PHI %12(s32), %bb.5, %9(s32), %bb.0
+    %13:_(s32) = G_PHI %9(s32), %bb.0, %14(s32), %bb.5
+    %15:_(s1) = G_CONSTANT i1 true
+    %16:_(s64) = G_SEXT %13(s32)
+    %17:_(s32) = G_CONSTANT i32 2
+    %18:_(s64) = G_SHL %16, %17(s32)
+    %19:_(p1) = G_PTR_ADD %5, %18(s64)
+    %20:_(s32) = G_LOAD %19(p1) :: (load (s32), addrspace 1)
+    %21:_(s32) = G_CONSTANT i32 0
+    %22:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), %20(s32), %21
+    %23:sreg_32_xm0_xexec(s32) = SI_IF %22(s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+
+    %24:_(s32) = G_CONSTANT i32 10
+    G_STORE %24(s32), %8(p1) :: (store (s32), addrspace 1)
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.5(0x80000000)
+
+    %25:_(s1) = G_CONSTANT i1 false
+    %26:_(s32) = G_CONSTANT i32 2
+    %27:_(s64) = G_SHL %16, %26(s32)
+    %28:_(p1) = G_PTR_ADD %2, %27(s64)
+    %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+    %30:_(s32) = G_CONSTANT i32 1
+    %31:_(s32) = G_ADD %29, %30
+    G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+    %32:_(s32) = G_ADD %13, %30
+    %33:_(s32) = G_CONSTANT i32 100
+    %34:_(s1) = G_ICMP intpred(ult), %13(s32), %33
+    G_BR %bb.5
+
+  bb.4:
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %35(s32)
+    S_ENDPGM 0
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+    %14:_(s32) = G_PHI %32(s32), %bb.3, %10(s32), %bb.1
+    %36:_(s1) = G_PHI %25(s1), %bb.3, %15(s1), %bb.1
+    %37:_(s1) = G_PHI %34(s1), %bb.3, %15(s1), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %23(s32)
+    %12:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %37(s1), %11(s32)
+    SI_LOOP %12(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.6:
+    successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+    %38:sreg_32_xm0_xexec(s1) = G_PHI %36(s1), %bb.5
+    %39:_(s32) = G_PHI %12(s32), %bb.5
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %39(s32)
+    %35:sreg_32_xm0_xexec(s32) = SI_IF %38(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+...
+
+---
+name: irreducible_cfg
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: irreducible_cfg
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.7(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $vgpr4
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]]
+  ; GFX10-NEXT:   G_BR %bb.7
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY4]](s32), [[COPY]]
+  ; GFX10-NEXT:   G_BR %bb.3
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI1]](s1), %17(s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.3(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   successors: %bb.5(0x04000000), %bb.7(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
+  ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[COPY3]], [[COPY2]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
+  ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
+  ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.2(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.7:
+  ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.0, [[PHI]](s1), %bb.2, [[C2]](s1), %bb.4
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI8]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.1
+  bb.0:
+    successors: %bb.7(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(s32) = COPY $vgpr3
+    %4:_(s32) = COPY $vgpr4
+    %5:_(s32) = COPY $vgpr5
+    %6:_(s32) = G_CONSTANT i32 0
+    %7:_(s1) = G_IMPLICIT_DEF
+    %8:_(s1) = G_ICMP intpred(sgt), %4(s32), %1
+    G_BR %bb.7
+
+  bb.1:
+    successors: %bb.3(0x80000000)
+
+    %9:_(s32) = G_CONSTANT i32 0
+    %10:_(s1) = G_ICMP intpred(sle), %4(s32), %0
+    G_BR %bb.3
+
+  bb.2:
+    successors: %bb.4(0x40000000), %bb.7(0x40000000)
+
+    %11:_(s1) = G_PHI %12(s1), %bb.6, %7(s1), %bb.7
+    %13:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
+    %16:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %13(s1), %17(s32)
+    SI_LOOP %16(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.6(0x04000000), %bb.3(0x7c000000)
+
+    %18:_(s32) = G_PHI %9(s32), %bb.1, %19(s32), %bb.3
+    %19:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %10(s1), %18(s32)
+    SI_LOOP %19(s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.4:
+    successors: %bb.5(0x04000000), %bb.7(0x7c000000)
+
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %16(s32)
+    %20:_(s1) = G_ICMP intpred(sgt), %5(s32), %0
+    %21:_(s1) = G_CONSTANT i1 true
+    %22:_(s1) = G_XOR %8, %21
+    %23:_(s1) = G_OR %20, %22
+    %24:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %23(s1), %25(s32)
+    SI_LOOP %24(s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.5
+
+  bb.5:
+    %26:_(s1) = G_PHI %20(s1), %bb.4
+    %27:_(s32) = G_PHI %24(s32), %bb.4
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %27(s32)
+    %28:_(s32) = G_SELECT %26(s1), %3, %2
+    %29:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), %28(s32)
+    $sgpr0 = COPY %29(s32)
+    SI_RETURN_TO_EPILOG implicit $sgpr0
+
+  bb.6:
+    successors: %bb.2(0x80000000)
+
+    %30:_(s32) = G_PHI %19(s32), %bb.3
+    %12:_(s1) = G_CONSTANT i1 false
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %30(s32)
+    G_BR %bb.2
+
+  bb.7:
+    successors: %bb.1(0x40000000), %bb.2(0x40000000)
+
+    %25:_(s32) = G_PHI %24(s32), %bb.4, %25(s32), %bb.2, %6(s32), %bb.0
+    %17:_(s32) = G_PHI %6(s32), %bb.4, %16(s32), %bb.2, %6(s32), %bb.0
+    %31:sreg_32_xm0_xexec(s1) = G_PHI %8(s1), %bb.0, %11(s1), %bb.2, %21(s1), %bb.4
+    %14:_(s1) = G_CONSTANT i1 true
+    %15:sreg_32_xm0_xexec(s32) = SI_IF %31(s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.1
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
new file mode 100644
index 0000000000000..e34a9c89f0ad2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -0,0 +1,171 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i1_phi:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:  .LBB0_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v0
+; GFX10-NEXT:    v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v5
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[1:2], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+  %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+  %neg_bool_counter = xor i1 %bool_counter, true
+  %fcounter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %fcounter, %val
+  %counterPlus1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %select = select i1 %bool_counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i1_non_phi:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:  .LBB1_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    flat_store_dword v[1:2], v0
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counterPlus1, %loop ]
+  %bool_counter = phi i1 [ true, %entry ], [ %neg_bool_counter, %loop ]
+  %neg_bool_counter = xor i1 %bool_counter, true
+  %fcounter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %fcounter, %val
+  %counterPlus1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  %select = select i1 %neg_bool_counter, float 1.000000e+00, float 0.000000e+00
+  store float %select, ptr %addr
+  ret void
+}
+
+; This is a temporal divergent uniform i1 structurize-cfg phi.
+; The loop has a uniform conditional 'if' whose body ends with a break.
+define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 %x.size, i32 addrspace(1)* inreg %a, i32 addrspace(1)* inreg %a.break) {
+; GFX10-LABEL: loop_with_1break:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_mov_b32_e32 v4, s1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s0
+; GFX10-NEXT:    v_mov_b32_e32 v5, s4
+; GFX10-NEXT:    s_branch .LBB2_3
+; GFX10-NEXT:  .LBB2_1: ; %loop.body
+; GFX10-NEXT:    ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v5
+; GFX10-NEXT:    v_cmp_lt_u32_e64 s0, v5, v2
+; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    global_load_dword v8, v[6:7], off
+; GFX10-NEXT:    v_mov_b32_e32 v5, v9
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v8
+; GFX10-NEXT:    global_store_dword v[6:7], v8, off
+; GFX10-NEXT:  .LBB2_2: ; %Flow
+; GFX10-NEXT:    ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT:    s_and_b32 s0, exec_lo, s0
+; GFX10-NEXT:    s_or_b32 s4, s0, s4
+; GFX10-NEXT:    s_and_b32 s0, 1, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execz .LBB2_5
+; GFX10-NEXT:  .LBB2_3: ; %A
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v3, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
+; GFX10-NEXT:    global_load_dword v8, v[8:9], off
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    s_cbranch_vccnz .LBB2_1
+; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB2_3 Depth=1
+; GFX10-NEXT:    s_mov_b32 s0, -1
+; GFX10-NEXT:    s_mov_b32 s1, 1
+; GFX10-NEXT:    ; implicit-def: $vgpr5
+; GFX10-NEXT:    s_branch .LBB2_2
+; GFX10-NEXT:  .LBB2_5: ; %loop.exit.guard
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s1, s0
+; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
+; GFX10-NEXT:    s_cbranch_execz .LBB2_7
+; GFX10-NEXT:  ; %bb.6: ; %break.body
+; GFX10-NEXT:    v_mov_b32_e32 v0, 10
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-NEXT:    global_store_dword v1, v0, s[2:3]
+; GFX10-NEXT:  .LBB2_7: ; %exit
+; GFX10-NEXT:    s_endpgm
+entry:
+  br label %A
+
+A:
+  %counter = phi i32 [ %counter.plus.1, %loop.body ], [ 0, %entry ]
+  %a.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %counter
+  %a.val = load i32, i32 addrspace(1)* %a.plus.counter
+  %a.cond = icmp eq i32 %a.val, 0
+  br i1 %a.cond, label %break.body, label %loop.body
+
+break.body:
+  store i32 10, i32 addrspace(1)* %a.break
+  br label %exit
+
+loop.body:
+  %x.plus.counter = getelementptr inbounds i32, i32 addrspace(1)* %x, i32 %counter
+  %x.val = load i32, i32 addrspace(1)* %x.plus.counter
+  %x.val.plus.1 = add i32 %x.val, 1
+  store i32 %x.val.plus.1, i32 addrspace(1)* %x.plus.counter
+  %counter.plus.1 = add i32 %counter, 1
+  %x.cond = icmp ult i32 %counter, %x.size
+  br i1 %x.cond, label %exit, label %A
+
+exit:
+  ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
new file mode 100644
index 0000000000000..d8f3ef03b5df6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -0,0 +1,324 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: temporal_divergent_i1_phi
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: temporal_divergent_i1_phi
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s1) = G_CONSTANT i1 true
+    %5:_(s32) = G_CONSTANT i32 0
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+    %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+    %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+    %12:_(s1) = G_CONSTANT i1 true
+    %11:_(s1) = G_XOR %10, %12
+    %13:_(s32) = G_UITOFP %8(s32)
+    %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+    %15:_(s32) = G_CONSTANT i32 1
+    %9:_(s32) = G_ADD %8, %15
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %16:_(s1) = G_PHI %10(s1), %bb.1
+    %17:_(s32) = G_PHI %7(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    %18:_(s32) = G_FCONSTANT float 0.000000e+00
+    %19:_(s32) = G_FCONSTANT float 1.000000e+00
+    %20:_(s32) = G_SELECT %16(s1), %19, %18
+    G_STORE %20(s32), %3(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: temporal_divergent_i1_non_phi
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: temporal_divergent_i1_non_phi
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s1) = G_CONSTANT i1 true
+    %5:_(s32) = G_CONSTANT i32 0
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %6:_(s32) = G_PHI %7(s32), %bb.1, %5(s32), %bb.0
+    %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+    %10:_(s1) = G_PHI %4(s1), %bb.0, %11(s1), %bb.1
+    %12:_(s1) = G_CONSTANT i1 true
+    %11:_(s1) = G_XOR %10, %12
+    %13:_(s32) = G_UITOFP %8(s32)
+    %14:_(s1) = G_FCMP floatpred(ogt), %13(s32), %0
+    %15:_(s32) = G_CONSTANT i32 1
+    %9:_(s32) = G_ADD %8, %15
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %14(s1), %6(s32)
+    SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %16:_(s1) = G_PHI %11(s1), %bb.1
+    %17:_(s32) = G_PHI %7(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %17(s32)
+    %18:_(s32) = G_FCONSTANT float 0.000000e+00
+    %19:_(s32) = G_FCONSTANT float 1.000000e+00
+    %20:_(s32) = G_SELECT %16(s1), %19, %18
+    G_STORE %20(s32), %3(p0) :: (store (s32))
+    SI_RETURN
+...
+
+---
+name: loop_with_1break
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: loop_with_1break
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY]](s32), [[COPY1]](s32)
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:_(s32) = COPY $sgpr1
+  ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $sgpr2
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:_(s32) = COPY $sgpr3
+  ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.3(0x50000000), %bb.5(0x30000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.3
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   successors: %bb.4(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
+  ; GFX10-NEXT:   G_STORE [[C4]](s32), [[MV2]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   G_BR %bb.4
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.3:
+  ; GFX10-NEXT:   successors: %bb.5(0x80000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+  ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
+  ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
+  ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
+  ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY2]]
+  ; GFX10-NEXT:   G_BR %bb.5
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.4:
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+  ; GFX10-NEXT:   S_ENDPGM 0
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.5:
+  ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.6
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.6:
+  ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(p1) = G_MERGE_VALUES %0(s32), %1(s32)
+    %3:_(s32) = COPY $vgpr2
+    %4:_(s32) = COPY $sgpr0
+    %5:_(s32) = COPY $sgpr1
+    %6:_(p1) = G_MERGE_VALUES %4(s32), %5(s32)
+    %7:_(s32) = COPY $sgpr2
+    %8:_(s32) = COPY $sgpr3
+    %9:_(p1) = G_MERGE_VALUES %7(s32), %8(s32)
+    %10:_(s32) = G_CONSTANT i32 0
+    %11:_(s32) = G_IMPLICIT_DEF
+
+  bb.1:
+    successors: %bb.3(0x50000000), %bb.5(0x30000000)
+
+    %12:_(s32) = G_PHI %13(s32), %bb.5, %10(s32), %bb.0
+    %14:_(s32) = G_PHI %10(s32), %bb.0, %15(s32), %bb.5
+    %16:_(s1) = G_CONSTANT i1 true
+    %17:_(s64) = G_SEXT %14(s32)
+    %18:_(s32) = G_CONSTANT i32 2
+    %19:_(s64) = G_SHL %17, %18(s32)
+    %20:_(p1) = G_PTR_ADD %6, %19(s64)
+    %21:_(s32) = G_LOAD %20(p1) :: (load (s32), addrspace 1)
+    %22:_(s32) = G_CONSTANT i32 0
+    %23:_(s1) = G_ICMP intpred(ne), %21(s32), %22
+    G_BRCOND %23(s1), %bb.3
+    G_BR %bb.5
+
+  bb.2:
+    successors: %bb.4(0x80000000)
+
+    %24:_(s32) = G_CONSTANT i32 10
+    G_STORE %24(s32), %9(p1) :: (store (s32), addrspace 1)
+    G_BR %bb.4
+
+  bb.3:
+    successors: %bb.5(0x80000000)
+
+    %25:_(s1) = G_CONSTANT i1 false
+    %26:_(s32) = G_CONSTANT i32 2
+    %27:_(s64) = G_SHL %17, %26(s32)
+    %28:_(p1) = G_PTR_ADD %2, %27(s64)
+    %29:_(s32) = G_LOAD %28(p1) :: (load (s32), addrspace 1)
+    %30:_(s32) = G_CONSTANT i32 1
+    %31:_(s32) = G_ADD %29, %30
+    G_STORE %31(s32), %28(p1) :: (store (s32), addrspace 1)
+    %32:_(s32) = G_ADD %14, %30
+    %33:_(s1) = G_ICMP intpred(ult), %14(s32), %3
+    G_BR %bb.5
+
+  bb.4:
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %34(s32)
+    S_ENDPGM 0
+
+  bb.5:
+    successors: %bb.6(0x04000000), %bb.1(0x7c000000)
+
+    %15:_(s32) = G_PHI %32(s32), %bb.3, %11(s32), %bb.1
+    %35:_(s1) = G_PHI %25(s1), %bb.3, %16(s1), %bb.1
+    %36:_(s1) = G_PHI %33(s1), %bb.3, %16(s1), %bb.1
+    %13:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %36(s1), %12(s32)
+    SI_LOOP %13(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.6
+
+  bb.6:
+    successors: %bb.2(0x40000000), %bb.4(0x40000000)
+
+    %37:sreg_32_xm0_xexec(s1) = G_PHI %35(s1), %bb.5
+    %38:_(s32) = G_PHI %13(s32), %bb.5
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %38(s32)
+    %34:sreg_32_xm0_xexec(s32) = SI_IF %37(s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
new file mode 100644
index 0000000000000..d065f228a4e4a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+
+define void @temporal_divergent_i32(float %val, ptr %addr) {
+; GFX10-LABEL: temporal_divergent_i32:
+; GFX10:       ; %bb.0: ; %entry
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    s_mov_b32 s4, -1
+; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB0_1: ; %loop
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %exit
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    flat_store_dword v[1:2], v3
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  br label %loop
+
+loop:
+  %counter = phi i32 [ 0, %entry ], [ %counter.plus.1, %loop ]
+  %f.counter = uitofp i32 %counter to float
+  %cond = fcmp ogt float %f.counter, %val
+  %counter.plus.1 = add i32 %counter, 1
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  store i32 %counter, ptr %addr
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
new file mode 100644
index 0000000000000..cc6ab89b34b39
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -0,0 +1,70 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+
+---
+name: temporal_divergent_i32
+legalized: true
+tracksRegLiveness: true
+body: |
+  ; GFX10-LABEL: name: temporal_divergent_i32
+  ; GFX10: bb.0:
+  ; GFX10-NEXT:   successors: %bb.1(0x80000000)
+  ; GFX10-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+  ; GFX10-NEXT:   [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
+  ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
+  ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
+  ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.1:
+  ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[ADD]](s32)
+  ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   G_BR %bb.2
+  ; GFX10-NEXT: {{  $}}
+  ; GFX10-NEXT: bb.2:
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s32)
+  ; GFX10-NEXT:   G_STORE [[PHI2]](s32), [[MV]](p0) :: (store (s32))
+  ; GFX10-NEXT:   SI_RETURN
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $vgpr0, $vgpr1, $vgpr2
+
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = COPY $vgpr1
+    %2:_(s32) = COPY $vgpr2
+    %3:_(p0) = G_MERGE_VALUES %1(s32), %2(s32)
+    %4:_(s32) = G_CONSTANT i32 0
+    %5:_(s32) = G_CONSTANT i32 -1
+
+  bb.1:
+    successors: %bb.2(0x04000000), %bb.1(0x7c000000)
+
+    %6:_(s32) = G_PHI %7(s32), %bb.1, %4(s32), %bb.0
+    %8:_(s32) = G_PHI %5(s32), %bb.0, %9(s32), %bb.1
+    %10:_(s32) = G_CONSTANT i32 1
+    %9:_(s32) = G_ADD %8, %10
+    %11:_(s32) = G_UITOFP %9(s32)
+    %12:_(s1) = G_FCMP floatpred(ogt), %11(s32), %0
+    %7:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), %12(s1), %6(s32)
+    SI_LOOP %7(s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
+    G_BR %bb.2
+
+  bb.2:
+    %13:_(s32) = G_PHI %9(s32), %bb.1
+    %14:_(s32) = G_PHI %7(s32), %bb.1
+    G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %14(s32)
+    G_STORE %13(s32), %3(p0) :: (store (s32))
+    SI_RETURN
+...

>From a63cac246427324776be5a5d17d719b5757416e0 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Fri, 24 Nov 2023 15:14:55 +0100
Subject: [PATCH 2/6] AMDGPU: refactor phi lowering from SILowerI1Copies (NFCI)

Make an abstract class PhiLoweringHelper and expose it for use in the
GlobalISel path.
SILowerI1Copies implements PhiLoweringHelper as Vreg1LoweringHelper,
which is equivalent to SILowerI1Copies.
A notable change is that createLaneMaskReg now clones attributes from
a register that has lane mask attributes instead of creating a register
with the lane mask register class. This is because lane masks have
different (more) attributes in GlobalISel.
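
For readers skimming the patch, a condensed before/after sketch of the
createLaneMaskReg change described above (names and bodies match the
diff below; this is illustrative only and omits the surrounding class
plumbing):

  // Before: pick a fixed SGPR class based on the wave size.
  static unsigned createLaneMaskReg(MachineFunction &MF) {
    const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
    MachineRegisterInfo &MRI = MF.getRegInfo();
    return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
                                                   : &AMDGPU::SReg_64RegClass);
  }

  // After: clone register class/bank and type from a register that already
  // carries lane mask attributes, since GlobalISel lane masks have more
  // attributes than just a register class.
  Register createLaneMaskReg(MachineRegisterInfo *MRI,
                             Register *LaneMaskRegAttrs) {
    return MRI->cloneVirtualRegister(*LaneMaskRegAttrs);
  }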
---
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 319 +++++++++++----------
 llvm/lib/Target/AMDGPU/SILowerI1Copies.h   |  79 +++++
 2 files changed, 248 insertions(+), 150 deletions(-)
 create mode 100644 llvm/lib/Target/AMDGPU/SILowerI1Copies.h

diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 68c8f4024e730..59c05bab531f4 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -21,57 +21,26 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "SILowerI1Copies.h"
 #include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/Target/CGPassBuilderOption.h"
 
 #define DEBUG_TYPE "si-i1-copies"
 
 using namespace llvm;
 
-static unsigned createLaneMaskReg(MachineFunction &MF);
-static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+static Register insertUndefLaneMask(MachineBasicBlock *MBB,
+                                    MachineRegisterInfo *MRI,
+                                    Register *LaneMaskRegAttrs);
 
 namespace {
 
-struct Incoming {
-  Register Reg;
-  MachineBasicBlock *Block;
-  Register UpdatedReg;
-
-  Incoming(Register Reg, MachineBasicBlock *Block, Register UpdatedReg)
-      : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
-};
-
 class SILowerI1Copies : public MachineFunctionPass {
 public:
   static char ID;
 
-private:
-  bool IsWave32 = false;
-  MachineFunction *MF = nullptr;
-  MachineDominatorTree *DT = nullptr;
-  MachinePostDominatorTree *PDT = nullptr;
-  MachineRegisterInfo *MRI = nullptr;
-  const GCNSubtarget *ST = nullptr;
-  const SIInstrInfo *TII = nullptr;
-
-  unsigned ExecReg;
-  unsigned MovOp;
-  unsigned AndOp;
-  unsigned OrOp;
-  unsigned XorOp;
-  unsigned AndN2Op;
-  unsigned OrN2Op;
-
-  DenseSet<unsigned> ConstrainRegs;
-
-public:
   SILowerI1Copies() : MachineFunctionPass(ID) {
     initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
   }
@@ -86,29 +55,53 @@ class SILowerI1Copies : public MachineFunctionPass {
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
+};
+
+class Vreg1LoweringHelper : public PhiLoweringHelper {
+public:
+  Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+                      MachinePostDominatorTree *PDT);
 
 private:
-  bool lowerCopiesFromI1();
-  bool lowerPhis();
-  bool lowerCopiesToI1();
-  bool isConstantLaneMask(Register Reg, bool &Val) const;
+  DenseSet<Register> ConstrainRegs;
+
+public:
+  void markAsLaneMask(Register DstReg) const override;
+  void getCandidatesForLowering(
+      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
+  void collectIncomingValuesFromPhi(
+      const MachineInstr *MI,
+      SmallVectorImpl<Incoming> &Incomings) const override;
+  void replaceDstReg(Register NewReg, Register OldReg,
+                     MachineBasicBlock *MBB) override;
   void buildMergeLaneMasks(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
-                           unsigned DstReg, unsigned PrevReg, unsigned CurReg);
-  MachineBasicBlock::iterator
-  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+                           Register DstReg, Register PrevReg,
+                           Register CurReg) override;
+  void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
 
+  bool lowerCopiesFromI1();
+  bool lowerCopiesToI1();
+  bool cleanConstrainRegs(bool Changed);
   bool isVreg1(Register Reg) const {
     return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
   }
-
-  bool isLaneMaskReg(unsigned Reg) const {
-    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
-           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
-               ST->getWavefrontSize();
-  }
 };
 
+Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
+                                         MachineDominatorTree *DT,
+                                         MachinePostDominatorTree *PDT)
+    : PhiLoweringHelper(MF, DT, PDT) {}
+
+bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
+  assert(Changed || ConstrainRegs.empty());
+  for (Register Reg : ConstrainRegs)
+    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
+  ConstrainRegs.clear();
+
+  return Changed;
+}
+
 /// Helper class that determines the relationship between incoming values of a
 /// phi in the control flow graph to determine where an incoming value can
 /// simply be taken as a scalar lane mask as-is, and where it needs to be
@@ -311,6 +304,7 @@ class LoopFinder {
   /// blocks, so that the SSA updater doesn't have to search all the way to the
   /// function entry.
   void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
+                      MachineRegisterInfo &MRI, Register *LaneMaskRegAttrs,
                       ArrayRef<Incoming> Incomings = {}) {
     assert(LoopLevel < CommonDominators.size());
 
@@ -319,13 +313,15 @@ class LoopFinder {
       Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);
 
     if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
-      SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+      SSAUpdater.AddAvailableValue(
+          Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
     } else {
       // The dominator is part of the loop or the given blocks, so add the
       // undef value to unreachable predecessors instead.
       for (MachineBasicBlock *Pred : Dom->predecessors()) {
         if (!inLoopLevel(*Pred, LoopLevel, Incomings))
-          SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+          SSAUpdater.AddAvailableValue(
+              Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
       }
     }
   }
@@ -415,19 +411,19 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
   return new SILowerI1Copies();
 }
 
-static unsigned createLaneMaskReg(MachineFunction &MF) {
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
-                                                 : &AMDGPU::SReg_64RegClass);
+Register createLaneMaskReg(MachineRegisterInfo *MRI,
+                           Register *LaneMaskRegAttrs) {
+  return MRI->cloneVirtualRegister(*LaneMaskRegAttrs);
 }
 
-static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
-  MachineFunction &MF = *MBB.getParent();
+static Register insertUndefLaneMask(MachineBasicBlock *MBB,
+                                    MachineRegisterInfo *MRI,
+                                    Register *LaneMaskRegAttrs) {
+  MachineFunction &MF = *MBB->getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
-  unsigned UndefReg = createLaneMaskReg(MF);
-  BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+  Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
           UndefReg);
   return UndefReg;
 }
@@ -444,47 +440,17 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
 bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
   // Only need to run this in SelectionDAG path.
   if (TheMF.getProperties().hasProperty(
-        MachineFunctionProperties::Property::Selected))
+          MachineFunctionProperties::Property::Selected))
     return false;
 
-  MF = &TheMF;
-  MRI = &MF->getRegInfo();
-  DT = &getAnalysis<MachineDominatorTree>();
-  PDT = &getAnalysis<MachinePostDominatorTree>();
-
-  ST = &MF->getSubtarget<GCNSubtarget>();
-  TII = ST->getInstrInfo();
-  IsWave32 = ST->isWave32();
-
-  if (IsWave32) {
-    ExecReg = AMDGPU::EXEC_LO;
-    MovOp = AMDGPU::S_MOV_B32;
-    AndOp = AMDGPU::S_AND_B32;
-    OrOp = AMDGPU::S_OR_B32;
-    XorOp = AMDGPU::S_XOR_B32;
-    AndN2Op = AMDGPU::S_ANDN2_B32;
-    OrN2Op = AMDGPU::S_ORN2_B32;
-  } else {
-    ExecReg = AMDGPU::EXEC;
-    MovOp = AMDGPU::S_MOV_B64;
-    AndOp = AMDGPU::S_AND_B64;
-    OrOp = AMDGPU::S_OR_B64;
-    XorOp = AMDGPU::S_XOR_B64;
-    AndN2Op = AMDGPU::S_ANDN2_B64;
-    OrN2Op = AMDGPU::S_ORN2_B64;
-  }
+  Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(),
+                             &getAnalysis<MachinePostDominatorTree>());
 
   bool Changed = false;
-  Changed |= lowerCopiesFromI1();
-  Changed |= lowerPhis();
-  Changed |= lowerCopiesToI1();
-
-  assert(Changed || ConstrainRegs.empty());
-  for (unsigned Reg : ConstrainRegs)
-    MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
-  ConstrainRegs.clear();
-
-  return Changed;
+  Changed |= Helper.lowerCopiesFromI1();
+  Changed |= Helper.lowerPhis();
+  Changed |= Helper.lowerCopiesToI1();
+  return Helper.cleanConstrainRegs(Changed);
 }
 
 #ifndef NDEBUG
@@ -496,7 +462,7 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
 }
 #endif
 
-bool SILowerI1Copies::lowerCopiesFromI1() {
+bool Vreg1LoweringHelper::lowerCopiesFromI1() {
   bool Changed = false;
   SmallVector<MachineInstr *, 4> DeadCopies;
 
@@ -539,23 +505,43 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
   return Changed;
 }
 
-bool SILowerI1Copies::lowerPhis() {
+PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
+                                     MachineDominatorTree *DT,
+                                     MachinePostDominatorTree *PDT)
+    : MF(MF), DT(DT), PDT(PDT) {
+  MRI = &MF->getRegInfo();
+
+  ST = &MF->getSubtarget<GCNSubtarget>();
+  TII = ST->getInstrInfo();
+  IsWave32 = ST->isWave32();
+
+  if (IsWave32) {
+    ExecReg = AMDGPU::EXEC_LO;
+    MovOp = AMDGPU::S_MOV_B32;
+    AndOp = AMDGPU::S_AND_B32;
+    OrOp = AMDGPU::S_OR_B32;
+    XorOp = AMDGPU::S_XOR_B32;
+    AndN2Op = AMDGPU::S_ANDN2_B32;
+    OrN2Op = AMDGPU::S_ORN2_B32;
+  } else {
+    ExecReg = AMDGPU::EXEC;
+    MovOp = AMDGPU::S_MOV_B64;
+    AndOp = AMDGPU::S_AND_B64;
+    OrOp = AMDGPU::S_OR_B64;
+    XorOp = AMDGPU::S_XOR_B64;
+    AndN2Op = AMDGPU::S_ANDN2_B64;
+    OrN2Op = AMDGPU::S_ORN2_B64;
+  }
+}
+
+bool PhiLoweringHelper::lowerPhis() {
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
   PhiIncomingAnalysis PIA(*PDT, TII);
   SmallVector<MachineInstr *, 4> Vreg1Phis;
   SmallVector<Incoming, 4> Incomings;
 
-#ifndef NDEBUG
-  DenseSet<unsigned> PhiRegisters;
-#endif
-
-  for (MachineBasicBlock &MBB : *MF) {
-    for (MachineInstr &MI : MBB.phis()) {
-      if (isVreg1(MI.getOperand(0).getReg()))
-        Vreg1Phis.push_back(&MI);
-    }
-  }
+  getCandidatesForLowering(Vreg1Phis);
   if (Vreg1Phis.empty())
     return false;
 
@@ -571,28 +557,10 @@ bool SILowerI1Copies::lowerPhis() {
     LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
 
     Register DstReg = MI->getOperand(0).getReg();
-    MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
-                                      : &AMDGPU::SReg_64RegClass);
-
-    // Collect incoming values.
-    for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
-      assert(i + 1 < MI->getNumOperands());
-      Register IncomingReg = MI->getOperand(i).getReg();
-      MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
-      MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
-
-      if (IncomingDef->getOpcode() == AMDGPU::COPY) {
-        IncomingReg = IncomingDef->getOperand(1).getReg();
-        assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
-        assert(!IncomingDef->getOperand(1).getSubReg());
-      } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
-        continue;
-      } else {
-        assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
-      }
+    markAsLaneMask(DstReg);
+    initializeLaneMaskRegisterAttributes(&DstReg);
 
-      Incomings.emplace_back(IncomingReg, IncomingMBB, Register{});
-    }
+    collectIncomingValuesFromPhi(MI, Incomings);
 
     // Sort the incomings such that incoming values that dominate other incoming
     // values are sorted earlier. This allows us to do some amount of on-the-fly
@@ -625,10 +593,11 @@ bool SILowerI1Copies::lowerPhis() {
     SSAUpdater.Initialize(DstReg);
 
     if (FoundLoopLevel) {
-      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, Incomings);
+      LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
+                        Incomings);
 
       for (auto &Incoming : Incomings) {
-        Incoming.UpdatedReg = createLaneMaskReg(*MF);
+        Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
         SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
       }
 
@@ -644,14 +613,16 @@ bool SILowerI1Copies::lowerPhis() {
       PIA.analyze(MBB, Incomings);
 
       for (MachineBasicBlock *MBB : PIA.predecessors())
-        SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+        SSAUpdater.AddAvailableValue(
+            MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));
 
       for (auto &Incoming : Incomings) {
         MachineBasicBlock &IMBB = *Incoming.Block;
         if (PIA.isSource(IMBB)) {
+          constrainIncomingRegisterTakenAsIs(Incoming);
           SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
         } else {
-          Incoming.UpdatedReg = createLaneMaskReg(*MF);
+          Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
           SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
         }
       }
@@ -669,7 +640,7 @@ bool SILowerI1Copies::lowerPhis() {
 
     Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
     if (NewReg != DstReg) {
-      MRI->replaceRegWith(NewReg, DstReg);
+      replaceDstReg(NewReg, DstReg, &MBB);
       MI->eraseFromParent();
     }
 
@@ -678,7 +649,7 @@ bool SILowerI1Copies::lowerPhis() {
   return true;
 }
 
-bool SILowerI1Copies::lowerCopiesToI1() {
+bool Vreg1LoweringHelper::lowerCopiesToI1() {
   bool Changed = false;
   MachineSSAUpdater SSAUpdater(*MF);
   LoopFinder LF(*DT, *PDT);
@@ -705,8 +676,9 @@ bool SILowerI1Copies::lowerCopiesToI1() {
 
       LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
 
-      MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
-                                        : &AMDGPU::SReg_64RegClass);
+      markAsLaneMask(DstReg);
+      initializeLaneMaskRegisterAttributes(&DstReg);
+
       if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
         continue;
 
@@ -716,7 +688,7 @@ bool SILowerI1Copies::lowerCopiesToI1() {
 
       if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
         assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
-        unsigned TmpReg = createLaneMaskReg(*MF);
+        Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
         BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
             .addReg(SrcReg)
             .addImm(0);
@@ -739,7 +711,7 @@ bool SILowerI1Copies::lowerCopiesToI1() {
       if (FoundLoopLevel) {
         SSAUpdater.Initialize(DstReg);
         SSAUpdater.AddAvailableValue(&MBB, DstReg);
-        LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+        LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);
 
         buildMergeLaneMasks(MBB, MI, DL, DstReg,
                             SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
@@ -754,7 +726,7 @@ bool SILowerI1Copies::lowerCopiesToI1() {
   return Changed;
 }
 
-bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
+bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
   const MachineInstr *MI;
   for (;;) {
     MI = MRI->getUniqueVRegDef(Reg);
@@ -807,7 +779,7 @@ static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
 /// Return a point at the end of the given \p MBB to insert SALU instructions
 /// for lane mask calculation. Take terminators and SCC into account.
 MachineBasicBlock::iterator
-SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
   auto InsertionPt = MBB.getFirstTerminator();
   bool TerminatorsUseSCC = false;
   for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
@@ -833,10 +805,53 @@ SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
   llvm_unreachable("SCC used by terminator but no def in block");
 }
 
-void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator I,
-                                          const DebugLoc &DL, unsigned DstReg,
-                                          unsigned PrevReg, unsigned CurReg) {
+// VReg_1 -> SReg_32 or SReg_64
+void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
+  MRI->setRegClass(DstReg, ST->getBoolRC());
+}
+
+void Vreg1LoweringHelper::getCandidatesForLowering(
+    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB.phis()) {
+      if (isVreg1(MI.getOperand(0).getReg()))
+        Vreg1Phis.push_back(&MI);
+    }
+  }
+}
+
+void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
+    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
+  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+    assert(i + 1 < MI->getNumOperands());
+    Register IncomingReg = MI->getOperand(i).getReg();
+    MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
+    MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+    if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+      IncomingReg = IncomingDef->getOperand(1).getReg();
+      assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+      assert(!IncomingDef->getOperand(1).getSubReg());
+    } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+      continue;
+    } else {
+      assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
+    }
+
+    Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
+  }
+}
+
+void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
+                                        MachineBasicBlock *MBB) {
+  MRI->replaceRegWith(NewReg, OldReg);
+}
+
+void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              const DebugLoc &DL,
+                                              Register DstReg, Register PrevReg,
+                                              Register CurReg) {
   bool PrevVal = false;
   bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
   bool CurVal = false;
@@ -855,13 +870,13 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
     return;
   }
 
-  unsigned PrevMaskedReg = 0;
-  unsigned CurMaskedReg = 0;
+  Register PrevMaskedReg;
+  Register CurMaskedReg;
   if (!PrevConstant) {
     if (CurConstant && CurVal) {
       PrevMaskedReg = PrevReg;
     } else {
-      PrevMaskedReg = createLaneMaskReg(*MF);
+      PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
       BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
           .addReg(PrevReg)
           .addReg(ExecReg);
@@ -872,7 +887,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
     if (PrevConstant && PrevVal) {
       CurMaskedReg = CurReg;
     } else {
-      CurMaskedReg = createLaneMaskReg(*MF);
+      CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
       BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
           .addReg(CurReg)
           .addReg(ExecReg);
@@ -895,3 +910,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
         .addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
   }
 }
+
+void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
+  return;
+}
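
As a quick aside for reviewers, here is a minimal standalone sketch (not part
of the patch) of the bitwise semantics that buildMergeLaneMasks materializes
with the S_ANDN2_B32/S_AND_B32/S_OR_B32 sequence above (B64 variants for
wave64). The helper name mergeLaneMasks and the wave32 constants are
illustrative only:

  // Bit i of a lane mask is the i1 value of lane i. Active lanes take the
  // current value, inactive lanes keep the previous one.
  #include <cassert>
  #include <cstdint>

  static uint32_t mergeLaneMasks(uint32_t Prev, uint32_t Cur, uint32_t Exec) {
    uint32_t PrevMasked = Prev & ~Exec; // S_ANDN2_B32 %prev, $exec_lo
    uint32_t CurMasked = Cur & Exec;    // S_AND_B32   $exec_lo, %cur
    return PrevMasked | CurMasked;      // S_OR_B32    %prev.masked, %cur.masked
  }

  int main() {
    uint32_t Exec = 0x0000FFFFu; // lower 16 lanes are active
    uint32_t Prev = 0xAAAAAAAAu; // value carried in from earlier control flow
    uint32_t Cur  = 0x0000FF00u; // value just computed by the active lanes
    uint32_t Merged = mergeLaneMasks(Prev, Cur, Exec);
    assert((Merged & Exec) == (Cur & Exec));    // active lanes see the new value
    assert((Merged & ~Exec) == (Prev & ~Exec)); // inactive lanes keep the old one
    return 0;
  }
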
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
new file mode 100644
index 0000000000000..99de6ecee5777
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -0,0 +1,79 @@
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+
+using namespace llvm;
+
+struct Incoming {
+  Register Reg;
+  MachineBasicBlock *Block;
+  Register UpdatedReg;
+
+  Incoming(Register Reg, MachineBasicBlock *Block, Register UpdatedReg)
+      : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
+};
+
+Register createLaneMaskReg(MachineRegisterInfo *MRI,
+                           Register *LaneMaskRegAttrs);
+
+class PhiLoweringHelper {
+public:
+  PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+                    MachinePostDominatorTree *PDT);
+  virtual ~PhiLoweringHelper() = default;
+
+protected:
+  bool IsWave32 = false;
+  MachineFunction *MF = nullptr;
+  MachineDominatorTree *DT = nullptr;
+  MachinePostDominatorTree *PDT = nullptr;
+  MachineRegisterInfo *MRI = nullptr;
+  const GCNSubtarget *ST = nullptr;
+  const SIInstrInfo *TII = nullptr;
+  Register *LaneMaskRegAttrs = nullptr;
+
+#ifndef NDEBUG
+  DenseSet<Register> PhiRegisters;
+#endif
+
+  Register ExecReg;
+  unsigned MovOp;
+  unsigned AndOp;
+  unsigned OrOp;
+  unsigned XorOp;
+  unsigned AndN2Op;
+  unsigned OrN2Op;
+
+public:
+  bool lowerPhis();
+  bool isConstantLaneMask(Register Reg, bool &Val) const;
+  MachineBasicBlock::iterator
+  getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+  void initializeLaneMaskRegisterAttributes(Register *LaneMask) {
+    LaneMaskRegAttrs = LaneMask;
+  }
+
+  bool isLaneMaskReg(Register Reg) const {
+    return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+           TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+               ST->getWavefrontSize();
+  }
+
+  // Helpers from lowerPhis that are different between sdag and global-isel.
+
+  virtual void markAsLaneMask(Register DstReg) const = 0;
+  virtual void getCandidatesForLowering(
+      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const = 0;
+  virtual void
+  collectIncomingValuesFromPhi(const MachineInstr *MI,
+                               SmallVectorImpl<Incoming> &Incomings) const = 0;
+  virtual void replaceDstReg(Register NewReg, Register OldReg,
+                             MachineBasicBlock *MBB) = 0;
+  virtual void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   const DebugLoc &DL, Register DstReg,
+                                   Register PrevReg, Register CurReg) = 0;
+  virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
+};
\ No newline at end of file

>From c9cd078b490de295d9a1bc54b62b1e5d17d7d4f9 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Fri, 24 Nov 2023 15:34:26 +0100
Subject: [PATCH 3/6] AMDGPU/GlobalISelDivergenceLowering: select divergent i1
 phis

Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper.
Use machine uniformity analysis to find divergent i1 phis and select them
as lane mask phis, in the same way SILowerI1Copies selects VReg_1 phis.
Note that divergent i1 phis include phis created by LCSSA, and all cases
of uses outside of the cycle are actually covered by "lowering LCSSA phis".
GlobalISel lane masks are registers with sgpr register class and S1 LLT.

TODO: General goal is that instructions created in this pass are fully
instruction-selected so that selection of lane mask phis is not split
across multiple passes.
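
The "uses outside of the cycle" case is easiest to see with a small
standalone simulation (again not part of the patch; the exit schedule and the
"Iter is odd" condition below are made up purely for illustration). Lanes
leave the loop in different iterations, so the i1 value observed after the
loop must be, per lane, the value from that lane's last active iteration;
merging into the lane-mask phi on every iteration is what provides that:

  #include <cstdint>
  #include <cstdio>

  int main() {
    uint32_t Exec = 0xFFFFFFFFu; // lanes still executing the loop
    uint32_t Acc  = 0;           // lane-mask phi, merged every iteration
    for (unsigned Iter = 1; Exec != 0; ++Iter) {
      // Divergent i1 computed in the loop body: "Iter is odd", per active lane.
      uint32_t Cond = (Iter % 2) ? Exec : 0;
      // Merge: inactive lanes keep their last value, active lanes take the new one.
      Acc = (Acc & ~Exec) | (Cond & Exec);
      // Made-up schedule: lane L leaves the loop after (L % 4) + 1 iterations.
      uint32_t Exit = 0;
      for (unsigned L = 0; L < 32; ++L)
        if (((Exec >> L) & 1) && (L % 4) + 1 == Iter)
          Exit |= 1u << L;
      Exec &= ~Exit;
    }
    // Lanes with L % 4 in {0, 2} last ran an odd iteration, so this prints
    // 0x55555555.
    std::printf("i1 value visible after the loop: 0x%08x\n", Acc);
    return 0;
  }
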
---
 .../AMDGPUGlobalISelDivergenceLowering.cpp    | 136 +++++++
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  32 ++
 ...-divergent-i1-phis-no-lane-mask-merging.ll |   1 +
 ...divergent-i1-phis-no-lane-mask-merging.mir |  70 ++--
 ...vergence-divergent-i1-used-outside-loop.ll |   1 +
 ...ergence-divergent-i1-used-outside-loop.mir | 334 +++++++++++++-----
 .../GlobalISel/divergence-structurizer.ll     |   1 +
 .../GlobalISel/divergence-structurizer.mir    | 286 +++++++++++----
 .../divergence-temporal-divergent-i1.ll       |   1 +
 .../divergence-temporal-divergent-i1.mir      |  96 +++--
 .../GlobalISel/divergent-control-flow.ll      |   1 +
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   1 +
 12 files changed, 754 insertions(+), 206 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index 5a4ae0d00b561..24be01fc1b0f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -14,7 +14,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "SILowerI1Copies.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "global-isel-divergence-lowering"
 
@@ -40,14 +43,136 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineCycleInfoWrapperPass>();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
 
+class DivergenceLoweringHelper : public PhiLoweringHelper {
+public:
+  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+                           MachinePostDominatorTree *PDT,
+                           MachineUniformityInfo *MUI);
+
+private:
+  MachineUniformityInfo *MUI = nullptr;
+
+public:
+  void markAsLaneMask(Register DstReg) const override;
+  void getCandidatesForLowering(
+      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
+  void collectIncomingValuesFromPhi(
+      const MachineInstr *MI,
+      SmallVectorImpl<Incoming> &Incomings) const override;
+  void replaceDstReg(Register NewReg, Register OldReg,
+                     MachineBasicBlock *MBB) override;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           Register DstReg, Register PrevReg,
+                           Register CurReg) override;
+  void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
+};
+
+DivergenceLoweringHelper::DivergenceLoweringHelper(
+    MachineFunction *MF, MachineDominatorTree *DT,
+    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
+    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI) {}
+
+// _(s1) -> SReg_32/64(s1)
+void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
+  assert(MRI->getType(DstReg) == LLT::scalar(1));
+
+  // Can't blindly set a register class on phi, users could have reg class
+  // constraints (e.g. sreg_32/64..._xexec classes for control flow intrinsics).
+  if (MRI->getRegClassOrNull(DstReg))
+    return;
+
+  MRI->setRegClass(DstReg, ST->getBoolRC());
+  return;
+}
+
+void DivergenceLoweringHelper::getCandidatesForLowering(
+    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
+  LLT S1 = LLT::scalar(1);
+
+  // Add divergent i1 phis to the list
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB.phis()) {
+      Register Dst = MI.getOperand(0).getReg();
+      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
+        Vreg1Phis.push_back(&MI);
+    }
+  }
+
+  return;
+}
+
+void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
+    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
+  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+    Incomings.emplace_back(MI->getOperand(i).getReg(),
+                           MI->getOperand(i + 1).getMBB(), Register());
+  }
+}
+
+void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
+                                             MachineBasicBlock *MBB) {
+  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
+      .addReg(NewReg);
+}
+
+// Get pointers to build instruction just after MI (skips phis if needed)
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+  MachineBasicBlock *InsertMBB = MI->getParent();
+  return std::make_pair(
+      InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+}
+
+void DivergenceLoweringHelper::buildMergeLaneMasks(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
+    Register DstReg, Register PrevReg, Register CurReg) {
+  // TODO: check if inputs are constants or results of a compare.
+
+  Register PrevRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  auto [PrevMBB, AfterPrevReg] = getInsertAfterPtrs(MRI->getVRegDef(PrevReg));
+  BuildMI(*PrevMBB, AfterPrevReg, DL, TII->get(AMDGPU::COPY), PrevRegCopy)
+      .addReg(PrevReg);
+  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
+      .addReg(PrevRegCopy)
+      .addReg(ExecReg);
+
+  Register CurRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  auto [CurMBB, AfterCurReg] = getInsertAfterPtrs(MRI->getVRegDef(CurReg));
+  BuildMI(*CurMBB, AfterCurReg, DL, TII->get(AMDGPU::COPY), CurRegCopy)
+      .addReg(CurReg);
+  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
+      .addReg(ExecReg)
+      .addReg(CurRegCopy);
+
+  BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
+      .addReg(PrevMaskedReg)
+      .addReg(CurMaskedReg);
+
+  return;
+}
+
+void DivergenceLoweringHelper::constrainIncomingRegisterTakenAsIs(
+    Incoming &In) {
+  return;
+}
+
 } // End anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                       "GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                     "GlobalISel divergence lowering", false, false)
 
@@ -62,5 +187,16 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
 
 bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
     MachineFunction &MF) {
+  MachineCycleInfo &CycleInfo =
+      getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+  MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+
+  MachineUniformityInfo MUI =
+      computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(), true);
+
+  DivergenceLoweringHelper Helper(
+      &MF, &DomTree, &getAnalysis<MachinePostDominatorTree>(), &MUI);
+
+  Helper.lowerPhis();
   return true;
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index b772efe04c714..3146f7f5a2185 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -207,7 +207,39 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
   return true;
 }
 
+bool isLaneMask(Register Reg, MachineRegisterInfo *MRI,
+                const SIRegisterInfo &TRI) {
+  if (MRI->getType(Reg) != LLT::scalar(1))
+    return false;
+  const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
+  if (!RC || !TRI.isSGPRClass(RC))
+    return false;
+
+  return true;
+}
+
+// PHI where all register operands are sgpr(register class) with S1 LLT.
+bool isLaneMaskPhi(MachineInstr &I, MachineRegisterInfo *MRI,
+                   const SIRegisterInfo &TRI) {
+  if (I.getOpcode() != AMDGPU::PHI)
+    return false;
+
+  if (!isLaneMask(I.getOperand(0).getReg(), MRI, TRI))
+    return false;
+
+  for (unsigned i = 1, e = I.getNumOperands(); i != e; i += 2) {
+    if (!isLaneMask(I.getOperand(i).getReg(), MRI, TRI))
+      return false;
+  }
+
+  return true;
+}
+
 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
+  // Already selected in divergence lowering pass
+  if (isLaneMaskPhi(I, MRI, TRI))
+    return true;
+
   const Register DefReg = I.getOperand(0).getReg();
   const LLT DefTy = MRI->getType(DefReg);
   if (DefTy == LLT::scalar(1)) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index b909d78d5e874..ce0547a991703 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 ; Divergent phis that don't require lowering using lane mask merging
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index a46bc505ceb59..b6864a00fba11 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -46,7 +46,7 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -126,6 +126,7 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
@@ -136,12 +137,17 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -191,30 +197,37 @@ body: |
   ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -279,25 +292,28 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
-  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI1]](s32), [[C3]]
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
   ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.4
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]]
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]]
   ; GFX10-NEXT:   G_BRCOND [[XOR]](s1), %bb.5
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -320,22 +336,26 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C8]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C9]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[XOR1]](s1), %bb.5
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C11]], [[C10]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -468,7 +488,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
   ; GFX10-NEXT:   G_BRCOND [[PHI1]](s1), %bb.5
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 7a68b94eb9a3a..615a3787f3cd6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 ; This file contains various tests that have divergent i1s used outside of
 ; the loop. These are lane masks in sgpr and need to have the correct value in
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 671ccbdc2290c..5d88d3c1fc6cd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -19,30 +19,49 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]]
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -103,42 +122,67 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %13(s1), %bb.3
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI1]](p1) :: (load (s32), addrspace 1)
+  ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI3]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
-  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI1]], [[C3]](s64)
+  ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]]
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI2]], [[C4]]
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[ADD]](s32), [[C5]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.1
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.3
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C7]], [[C6]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[C7]], [[C6]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -211,30 +255,37 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -298,6 +349,7 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -305,29 +357,49 @@ body: |
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, [[C1]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -335,9 +407,17 @@ body: |
   ; GFX10-NEXT:   successors: %bb.7(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C7]]
-  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI2]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]]
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -353,22 +433,32 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.3(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[C6]](s1), %bb.4, [[C3]](s1), %bb.3
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C9]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI5]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.7
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY19]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -479,35 +569,40 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]]
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI2]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
-  ; GFX10-NEXT:   G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
+  ; GFX10-NEXT:   G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -515,27 +610,35 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C3]](s1), %bb.5, [[C2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.8(0x40000000), %bb.9(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.6
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.6
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY11]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
@@ -648,47 +751,75 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %14(s1), %bb.3
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[PHI3]]
+  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI1]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[FREEZE]](s1), %bb.3
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C6]], [[C5]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C6]], [[C5]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -770,20 +901,39 @@ body: |
   ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -798,6 +948,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
@@ -805,9 +956,16 @@ body: |
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]]
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -817,21 +975,27 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index 6f60adf2f7ab6..19e96376330a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 ; Simplest case, if-then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
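[Editorial aside, not part of the patch: the repeated S_ANDN2_B32 / S_AND_B32 / S_OR_B32 triples in the generated checks above are the wave32 lane-mask merge. A minimal standalone C++ sketch of that bit pattern, with made-up names and assuming wave32 so masks and $exec_lo are 32-bit:

#include <cstdint>

// Merge a new per-lane i1 value into an existing wave32 lane mask:
// lanes currently active in EXEC take the new value, inactive lanes
// keep whatever the mask held before.
uint32_t mergeLaneMask(uint32_t PrevMask, uint32_t CurVal, uint32_t ExecLo) {
  uint32_t Inactive = PrevMask & ~ExecLo; // S_ANDN2_B32 %prev, $exec_lo
  uint32_t Active   = ExecLo & CurVal;    // S_AND_B32   $exec_lo, %cur
  return Inactive | Active;               // S_OR_B32    %inactive, %active
}

For wave64 the same merge would use the B64 forms with $exec.]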
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 5f956c0f1e2e3..b6330f63f70ae 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -18,6 +18,7 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -28,13 +29,18 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -85,6 +91,7 @@ body: |
   ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -93,7 +100,9 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %10(s1), %bb.3, [[DEF]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
   ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -102,6 +111,10 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -109,14 +122,19 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[PHI]](s1), %bb.1, [[ICMP1]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI1]](s1), [[C3]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C3]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -183,20 +201,28 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -210,23 +236,28 @@ body: |
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C5]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C5]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C5]]
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C6]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.2, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -308,20 +339,28 @@ body: |
   ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -329,6 +368,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -341,10 +381,11 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %34(s1), %bb.5, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
@@ -358,21 +399,30 @@ body: |
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C9]]
+  ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C4]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -481,20 +531,28 @@ body: |
   ; GFX10-NEXT:   [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -502,6 +560,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -514,10 +573,11 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI %37(s1), %bb.5, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
@@ -525,6 +585,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
   ; GFX10-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
@@ -537,9 +598,14 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s1) = G_PHI %47(s1), %bb.7, [[C4]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
@@ -552,21 +618,30 @@ body: |
   ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[C11]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD3]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C11]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C11]]
   ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-  ; GFX10-NEXT:   [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C12]]
+  ; GFX10-NEXT:   [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]]
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s1) = G_PHI [[ICMP3]](s1), %bb.6, [[C7]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
-  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
     successors: %bb.1(0x80000000)
@@ -696,20 +771,39 @@ body: |
   ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -724,6 +818,7 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
@@ -731,9 +826,16 @@ body: |
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]]
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -743,21 +845,27 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
@@ -857,7 +965,11 @@ body: |
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]]
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -870,18 +982,27 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI1]](s1), %17(s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.3(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
@@ -890,18 +1011,28 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
   ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
+  ; GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[COPY3]], [[COPY2]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[COPY3]], [[COPY2]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
   ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
@@ -911,17 +1042,42 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.0, [[PHI]](s1), %bb.2, [[C2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]]
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]]
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI8]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY19]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_6:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_6]](s1), [[S_AND_B32_6]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY17]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   bb.0:
     successors: %bb.7(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index e34a9c89f0ad2..038866dc78185 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_phi:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index d8f3ef03b5df6..26eca9cd1b34b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -17,30 +17,37 @@ body: |
   ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY3]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -97,30 +104,37 @@ body: |
   ; GFX10-NEXT:   [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -183,20 +197,31 @@ body: |
   ; GFX10-NEXT:   [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x50000000), %bb.5(0x30000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
   ; GFX10-NEXT:   [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.3
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -218,8 +243,12 @@ body: |
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
   ; GFX10-NEXT:   G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
-  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY2]]
+  ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C7]]
+  ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[COPY2]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
@@ -229,20 +258,25 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1)
+  ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
     successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 25e2267fdee89..6384c47398fce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; REQUIRES: do-not-run-me
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a9..4caf83774bbba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s
+; REQUIRES: do-not-run-me
 
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:

>From dcc8433c323b2798d5c565f4857fcfc2b15574a6 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Fri, 24 Nov 2023 15:41:47 +0100
Subject: [PATCH 4/6] GlobalISel: adapt MachineSSAUpdater for use in GlobalISel
 path

GlobalISel works with registers that can have a register class,
a register bank and an LLT as attributes.
When initializing MachineSSAUpdater, save the actual register and
use it to clone the attributes of all new registers, instead of
using only its register class.
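
As a rough illustration (not part of the patch; the two helper names below
are made up for this example), the difference between the old and new way of
materializing updater-created registers is:

  // Sketch only: contrast creating a register from a class with cloning
  // one from an existing register. Both MachineRegisterInfo calls are the
  // real APIs used before/after this patch.
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  using namespace llvm;

  // Old behaviour: only the register class survives, so generic virtual
  // registers (which carry an LLT and possibly a register bank instead of
  // a register class) cannot be reproduced faithfully.
  static Register newDefFromClass(MachineRegisterInfo &MRI,
                                  const TargetRegisterClass *RC) {
    return MRI.createVirtualRegister(RC);
  }

  // New behaviour: clone from the register passed to Initialize(), which
  // copies its register class or register bank together with its LLT.
  static Register newDefFromReg(MachineRegisterInfo &MRI, Register Orig) {
    return MRI.cloneVirtualRegister(Orig);
  }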
---
 llvm/include/llvm/CodeGen/MachineSSAUpdater.h |   5 +-
 llvm/lib/CodeGen/MachineSSAUpdater.cpp        |  44 ++--
 ...-divergent-i1-phis-no-lane-mask-merging.ll | 123 ++++++-----
 ...divergent-i1-phis-no-lane-mask-merging.mir |  12 +-
 ...vergence-divergent-i1-used-outside-loop.ll | 198 ++++++++++++------
 ...ergence-divergent-i1-used-outside-loop.mir | 108 +++++-----
 .../GlobalISel/divergence-structurizer.ll     | 116 +++++++---
 .../GlobalISel/divergence-structurizer.mir    |  96 ++++-----
 .../divergence-temporal-divergent-i1.ll       |  82 +++++---
 .../divergence-temporal-divergent-i1.mir      |  20 +-
 .../divergence-temporal-divergent-reg.ll      |   2 +-
 .../GlobalISel/divergent-control-flow.ll      |  43 ++--
 12 files changed, 493 insertions(+), 356 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
index bbd09d7d151ba..765cabdb31309 100644
--- a/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/llvm/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -40,8 +40,8 @@ class MachineSSAUpdater {
   //typedef DenseMap<MachineBasicBlock*, Register> AvailableValsTy;
   void *AV = nullptr;
 
-  /// VRC - Register class of the current virtual register.
-  const TargetRegisterClass *VRC = nullptr;
+  /// RegAttrs - Current virtual register; new registers clone its attributes.
+  Register RegAttrs;
 
   /// InsertedPHIs - If this is non-null, the MachineSSAUpdater adds all PHI
   /// nodes that it creates to the vector.
@@ -62,7 +62,6 @@ class MachineSSAUpdater {
   /// Initialize - Reset this object to get ready for a new set of SSA
   /// updates.
   void Initialize(Register V);
-  void Initialize(const TargetRegisterClass *RC);
 
   /// AddAvailableValue - Indicate that a rewritten value is available at the
   /// end of the specified block with the specified value.
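
For context, a hypothetical caller of the new Register-based Initialize()
(the function and variable names below are illustrative, not taken from this
patch) could look like:

  // Usage sketch: rewrite a value that is defined in one block and used in
  // another; updater-created PHIs/IMPLICIT_DEFs clone OrigVReg's register
  // class or bank and its LLT rather than reusing only a register class.
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/CodeGen/MachineSSAUpdater.h"
  using namespace llvm;

  static Register rewriteAcrossBlocks(MachineFunction &MF, Register OrigVReg,
                                      MachineBasicBlock &DefMBB, Register DefVal,
                                      MachineBasicBlock &UseMBB) {
    MachineSSAUpdater Updater(MF);
    Updater.Initialize(OrigVReg);
    Updater.AddAvailableValue(&DefMBB, DefVal);
    return Updater.GetValueInMiddleOfBlock(&UseMBB);
  }
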
diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
index 48076663ddf53..48537057e2031 100644
--- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -51,17 +51,13 @@ MachineSSAUpdater::~MachineSSAUpdater() {
 
 /// Initialize - Reset this object to get ready for a new set of SSA
 /// updates.
-void MachineSSAUpdater::Initialize(const TargetRegisterClass *RC) {
+void MachineSSAUpdater::Initialize(Register V) {
   if (!AV)
     AV = new AvailableValsTy();
   else
     getAvailableVals(AV).clear();
 
-  VRC = RC;
-}
-
-void MachineSSAUpdater::Initialize(Register V) {
-  Initialize(MRI->getRegClass(V));
+  RegAttrs = V;
 }
 
 /// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
@@ -115,13 +111,12 @@ Register LookForIdenticalPHI(MachineBasicBlock *BB,
 /// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which defines
 /// a value of the given register class at the start of the specified basic
 /// block. It returns the virtual register defined by the instruction.
-static
-MachineInstrBuilder InsertNewDef(unsigned Opcode,
-                           MachineBasicBlock *BB, MachineBasicBlock::iterator I,
-                           const TargetRegisterClass *RC,
-                           MachineRegisterInfo *MRI,
-                           const TargetInstrInfo *TII) {
-  Register NewVR = MRI->createVirtualRegister(RC);
+static MachineInstrBuilder InsertNewDef(unsigned Opcode, MachineBasicBlock *BB,
+                                        MachineBasicBlock::iterator I,
+                                        Register RegAttrs,
+                                        MachineRegisterInfo *MRI,
+                                        const TargetInstrInfo *TII) {
+  Register NewVR = MRI->cloneVirtualRegister(RegAttrs);
   return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
 }
 
@@ -158,9 +153,9 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB,
     if (ExistingValueOnly)
       return Register();
     // Insert an implicit_def to represent an undef value.
-    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
-                                        BB, BB->getFirstTerminator(),
-                                        VRC, MRI, TII);
+    MachineInstr *NewDef =
+        InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstTerminator(),
+                     RegAttrs, MRI, TII);
     return NewDef->getOperand(0).getReg();
   }
 
@@ -197,8 +192,8 @@ Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB,
 
   // Otherwise, we do need a PHI: insert one now.
   MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
-  MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
-                                                 Loc, VRC, MRI, TII);
+  MachineInstrBuilder InsertedPHI =
+      InsertNewDef(TargetOpcode::PHI, BB, Loc, RegAttrs, MRI, TII);
 
   // Fill in all the predecessors of the PHI.
   for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
@@ -300,10 +295,9 @@ class SSAUpdaterTraits<MachineSSAUpdater> {
   static Register GetUndefVal(MachineBasicBlock *BB,
                               MachineSSAUpdater *Updater) {
     // Insert an implicit_def to represent an undef value.
-    MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
-                                        BB, BB->getFirstNonPHI(),
-                                        Updater->VRC, Updater->MRI,
-                                        Updater->TII);
+    MachineInstr *NewDef =
+        InsertNewDef(TargetOpcode::IMPLICIT_DEF, BB, BB->getFirstNonPHI(),
+                     Updater->RegAttrs, Updater->MRI, Updater->TII);
     return NewDef->getOperand(0).getReg();
   }
 
@@ -312,9 +306,9 @@ class SSAUpdaterTraits<MachineSSAUpdater> {
   static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
                                  MachineSSAUpdater *Updater) {
     MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
-    MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
-                                     Updater->VRC, Updater->MRI,
-                                     Updater->TII);
+    MachineInstr *PHI =
+        InsertNewDef(TargetOpcode::PHI, BB, Loc, Updater->RegAttrs,
+                     Updater->MRI, Updater->TII);
     return PHI->getOperand(0).getReg();
   }
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index ce0547a991703..6ec152b1f55e3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
 
 ; Divergent phis that don't require lowering using lane mask merging
 
@@ -65,15 +64,16 @@ exit:
 define amdgpu_ps void @divergent_i1_phi_uniform_branch_simple(i32 addrspace(1)* %out, i32 %tid, i32 inreg %cond) {
 ; GFX10-LABEL: divergent_i1_phi_uniform_branch_simple:
 ; GFX10:       ; %bb.0: ; %A
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, 6, v2
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_cbranch_scc0 .LBB1_2
-; GFX10-NEXT:  ; %bb.1:
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 6, v2
-; GFX10-NEXT:    s_branch .LBB1_3
-; GFX10-NEXT:  .LBB1_2: ; %B
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
-; GFX10-NEXT:  .LBB1_3: ; %exit
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
+; GFX10-NEXT:    s_cbranch_scc1 .LBB1_2
+; GFX10-NEXT:  ; %bb.1: ; %B
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s1, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s1, s0, s1
+; GFX10-NEXT:  .LBB1_2: ; %exit
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s1
 ; GFX10-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 A:
@@ -99,23 +99,27 @@ define void @divergent_i1_phi_used_inside_loop(float %val, ptr %addr) {
 ; GFX10-LABEL: divergent_i1_phi_used_inside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    ; implicit-def: $sgpr6
 ; GFX10-NEXT:  .LBB2_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v3
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s6, s6, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -142,44 +146,49 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v8, 0x3e8
-; GFX10-NEXT:    v_mov_b32_e32 v9, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3e8
+; GFX10-NEXT:    v_mov_b32_e32 v8, s5
+; GFX10-NEXT:    ; implicit-def: $sgpr6
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_branch .LBB3_2
 ; GFX10-NEXT:  .LBB3_1: ; %loop_body
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v10, v9
-; GFX10-NEXT:    v_xor_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v10, v0
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v9, v8
+; GFX10-NEXT:    s_xor_b32 s4, s4, -1
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v8
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s6, s6, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_2: ; %loop_start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v9
-; GFX10-NEXT:    s_mov_b32 s5, 1
+; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
+; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
+; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v9
 ; GFX10-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX10-NEXT:  ; %bb.3: ; %else
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    flat_store_dword v[6:7], v8
+; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    flat_store_dword v[6:7], v1
 ; GFX10-NEXT:  .LBB3_4: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT:    s_xor_b32 s5, s5, 1
-; GFX10-NEXT:    s_and_b32 s5, s5, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX10-NEXT:    s_xor_b32 s7, s7, 1
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX10-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX10-NEXT:  ; %bb.5: ; %if
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX10-NEXT:    flat_store_dword v[4:5], v8
+; GFX10-NEXT:    flat_store_dword v[4:5], v1
 ; GFX10-NEXT:    s_branch .LBB3_1
 ; GFX10-NEXT:  .LBB3_6: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -219,8 +228,8 @@ exit:
 define dllexport amdgpu_cs void @_amdgpu_cs_main( i32 inreg noundef %.userdata0, <3 x i32> inreg noundef %.WorkgroupId, <3 x i32> noundef %.LocalInvocationId) #0 {
 ; GFX10-LABEL: _amdgpu_cs_main:
 ; GFX10:       ; %bb.0: ; %.entry
-; GFX10-NEXT:    s_mov_b32 s12, 0
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
+; GFX10-NEXT:    s_mov_b32 s12, 0
 ; GFX10-NEXT:    s_mov_b32 s13, -1
 ; GFX10-NEXT:    s_mov_b32 s2, s0
 ; GFX10-NEXT:    s_and_b64 s[4:5], s[4:5], s[12:13]
@@ -228,40 +237,38 @@ define dllexport amdgpu_cs void @_amdgpu_cs_main( i32 inreg noundef %.userdata0,
 ; GFX10-NEXT:    v_mbcnt_lo_u32_b32 v1, -1, 0
 ; GFX10-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GFX10-NEXT:    s_load_dwordx8 s[4:11], s[2:3], 0x0
-; GFX10-NEXT:    s_mov_b32 s2, 1
-; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v2, -1, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 2, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 1, v2
+; GFX10-NEXT:    v_mbcnt_hi_u32_b32 v1, -1, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 2, v1
+; GFX10-NEXT:    v_and_b32_e32 v3, 1, v1
 ; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    buffer_load_dword v1, v1, s[4:7], 0 offen
+; GFX10-NEXT:    buffer_load_dword v2, v2, s[4:7], 0 offen
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    ; implicit-def: $vgpr3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, 0, v2
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_4
 ; GFX10-NEXT:  ; %bb.1: ; %.preheader.preheader
-; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s12
+; GFX10-NEXT:    v_mov_b32_e32 v4, s12
 ; GFX10-NEXT:  .LBB4_2: ; %.preheader
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    buffer_load_dword v5, v4, s[4:7], 0 offen
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, -1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 4, v4
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v2
+; GFX10-NEXT:    buffer_load_dword v5, v3, s[4:7], 0 offen
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, 4, v3
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v5, v4
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB4_2
 ; GFX10-NEXT:  ; %bb.3: ; %.preheader._crit_edge
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v1
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT:    s_mov_b32 s13, 0
 ; GFX10-NEXT:    s_or_b32 s2, s0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
-; GFX10-NEXT:    s_mov_b32 s2, 0
 ; GFX10-NEXT:  .LBB4_4: ; %Flow
-; GFX10-NEXT:    s_and_b32 s2, s2, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX10-NEXT:    s_cbranch_scc0 .LBB4_6
+; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s13
+; GFX10-NEXT:    s_cbranch_vccz .LBB4_6
 ; GFX10-NEXT:  ; %bb.5: ; %.19
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_or_b32_e32 v3, 2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index b6864a00fba11..aedc087d27a19 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -143,8 +143,8 @@ body: |
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
@@ -202,11 +202,11 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
-  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -297,11 +297,11 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 615a3787f3cd6..6b2843e3a8405 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
 
 ; This file contains various tests that have divergent i1s used outside of
 ; the loop. These are lane masks in sgpr and need to have correct value in
@@ -17,22 +16,29 @@ define void @divergent_i1_phi_used_outside_loop(float %val, float %pre.cond.val,
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
+; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s6, s5, s6
+; GFX10-NEXT:    ; implicit-def: $sgpr5
 ; GFX10-NEXT:  .LBB0_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v6, v1
-; GFX10-NEXT:    v_mov_b32_e32 v5, v4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
+; GFX10-NEXT:    s_xor_b32 s7, s6, -1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v0
-; GFX10-NEXT:    v_xor_b32_e32 v4, 1, v5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 s8, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, s7
+; GFX10-NEXT:    s_andn2_b32 s5, s5, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT:    s_or_b32 s7, s8, s7
+; GFX10-NEXT:    s_or_b32 s5, s5, s6
+; GFX10-NEXT:    s_mov_b32 s6, s7
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -60,8 +66,11 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s4, -1
-; GFX10-NEXT:    v_mov_b32_e32 v5, 1
+; GFX10-NEXT:    ; implicit-def: $sgpr6
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s4
+; GFX10-NEXT:    s_andn2_b32 s5, s4, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT:    s_or_b32 s4, s5, s4
 ; GFX10-NEXT:    s_branch .LBB1_2
 ; GFX10-NEXT:  .LBB1_1: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
@@ -70,20 +79,26 @@ define void @divergent_i1_phi_used_outside_loop_larger_loop_body(float %val, ptr
 ; GFX10-NEXT:    v_add_co_u32 v1, s4, v1, 4
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v2, s4, 0, v2, s4
 ; GFX10-NEXT:    v_cmp_le_i32_e32 vcc_lo, 10, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    s_andn2_b32 s7, s5, exec_lo
+; GFX10-NEXT:    s_and_b32 s8, exec_lo, s6
+; GFX10-NEXT:    s_or_b32 s4, s7, s8
 ; GFX10-NEXT:    s_cbranch_vccz .LBB1_4
 ; GFX10-NEXT:  .LBB1_2: ; %loop.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, v5
-; GFX10-NEXT:    s_mov_b32 s6, s5
+; GFX10-NEXT:    s_mov_b32 s5, s4
+; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s6, s4, s6
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s5
 ; GFX10-NEXT:    s_cbranch_execz .LBB1_1
 ; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
 ; GFX10-NEXT:    ; in Loop: Header=BB1_2 Depth=1
 ; GFX10-NEXT:    global_load_dword v5, v[1:2], off
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s6, 0, v5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v5
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
 ; GFX10-NEXT:    s_branch .LBB1_1
 ; GFX10-NEXT:  .LBB1_4: ; %exit
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s5
@@ -123,23 +138,28 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr6
+; GFX10-NEXT:    v_mov_b32_e32 v1, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX10-NEXT:  .LBB2_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
-; GFX10-NEXT:    v_xor_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v5, v0
+; GFX10-NEXT:    s_xor_b32 s7, vcc_lo, -1
+; GFX10-NEXT:    s_or_b32 s5, s4, s5
+; GFX10-NEXT:    v_mov_b32_e32 v4, s7
+; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s7
+; GFX10-NEXT:    s_or_b32 s6, s4, s6
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -177,44 +197,59 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    s_mov_b32 s6, -1
+; GFX10-NEXT:    s_mov_b32 s6, 1
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s5
+; GFX10-NEXT:    ; implicit-def: $sgpr6
+; GFX10-NEXT:    ; implicit-def: $sgpr7
+; GFX10-NEXT:    ; implicit-def: $sgpr8
 ; GFX10-NEXT:    s_branch .LBB3_3
 ; GFX10-NEXT:  .LBB3_2: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT:    s_xor_b32 s7, s7, 1
-; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
-; GFX10-NEXT:    s_or_b32 s5, s6, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT:    s_xor_b32 s9, s8, -1
+; GFX10-NEXT:    s_and_b32 s10, exec_lo, s7
+; GFX10-NEXT:    s_or_b32 s5, s10, s5
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s9, exec_lo, s9
+; GFX10-NEXT:    s_or_b32 s6, s6, s9
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_5
 ; GFX10-NEXT:  .LBB3_3: ; %loop.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
-; GFX10-NEXT:    s_mov_b32 s6, -1
-; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
+; GFX10-NEXT:    s_and_b32 s9, exec_lo, -1
+; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
+; GFX10-NEXT:    s_or_b32 s8, s8, s9
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
+; GFX10-NEXT:    s_or_b32 s7, s7, s9
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
 ; GFX10-NEXT:    global_load_dword v6, v[6:7], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    s_and_saveexec_b32 s8, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s9, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX10-NEXT:  ; %bb.4: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v5
-; GFX10-NEXT:    v_cmp_lt_i32_e64 s6, v5, v0
-; GFX10-NEXT:    s_mov_b32 s7, 0
+; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    s_andn2_b32 s8, s8, exec_lo
+; GFX10-NEXT:    s_and_b32 s10, exec_lo, 0
+; GFX10-NEXT:    s_andn2_b32 s7, s7, exec_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v6
+; GFX10-NEXT:    s_and_b32 s11, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s8, s8, s10
+; GFX10-NEXT:    s_or_b32 s7, s7, s11
 ; GFX10-NEXT:    s_branch .LBB3_2
 ; GFX10-NEXT:  .LBB3_5: ; %loop.exit.guard
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_and_b32 s5, 1, s7
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s6, 0, s5
+; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s6
+; GFX10-NEXT:    s_or_b32 s6, s5, s6
 ; GFX10-NEXT:  .LBB3_6: ; %Flow1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
@@ -266,21 +301,24 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr6
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX10-NEXT:    s_branch .LBB4_2
 ; GFX10-NEXT:  .LBB4_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
-; GFX10-NEXT:    s_and_b32 s4, 1, s6
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, s4
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s7
 ; GFX10-NEXT:    s_or_b32 s5, s4, s5
+; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s6, s4, s6
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_6
 ; GFX10-NEXT:  .LBB4_2: ; %cond.block.0
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v5
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v4
-; GFX10-NEXT:    s_and_saveexec_b32 s6, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s7, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_4
 ; GFX10-NEXT:  ; %bb.3: ; %if.block.0
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
@@ -292,20 +330,22 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:  .LBB4_4: ; %loop.break.block
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
-; GFX10-NEXT:    s_mov_b32 s6, 1
+; GFX10-NEXT:    s_mov_b32 s7, 1
 ; GFX10-NEXT:    ; implicit-def: $vgpr5
-; GFX10-NEXT:    s_and_saveexec_b32 s7, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
 ; GFX10-NEXT:  ; %bb.5: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB4_2 Depth=1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v4
-; GFX10-NEXT:    s_mov_b32 s6, 0
+; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, 0
+; GFX10-NEXT:    s_or_b32 s7, s4, s7
 ; GFX10-NEXT:    s_branch .LBB4_1
 ; GFX10-NEXT:  .LBB4_6: ; %cond.block.1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s6
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX10-NEXT:  ; %bb.7: ; %if.block.1
 ; GFX10-NEXT:    global_store_dword v[6:7], v4, off
@@ -370,34 +410,44 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
 ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    v_mov_b32_e32 v6, 1
+; GFX10-NEXT:    s_mov_b32 s3, 1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
+; GFX10-NEXT:    ; implicit-def: $sgpr2
 ; GFX10-NEXT:    s_branch .LBB5_2
 ; GFX10-NEXT:  .LBB5_1: ; %loop.cond
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_cmp_lt_i32_e32 vcc_lo, v5, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s1
-; GFX10-NEXT:    v_mov_b32_e32 v5, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v5
 ; GFX10-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_or_b32 s1, s1, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX10-NEXT:  .LBB5_2: ; %loop.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v6, 1, v6
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v6
-; GFX10-NEXT:    s_and_saveexec_b32 s2, s1
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_and_saveexec_b32 s4, s3
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_1
 ; GFX10-NEXT:  ; %bb.3: ; %is.eq.zero
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v1, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v2, v7, vcc_lo
 ; GFX10-NEXT:    global_load_dword v6, v[6:7], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, 0, v6
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s2, s2, s3
+; GFX10-NEXT:    ; implicit-def: $sgpr3
 ; GFX10-NEXT:    s_branch .LBB5_1
 ; GFX10-NEXT:  .LBB5_4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s0
@@ -436,40 +486,52 @@ define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-LABEL: loop_with_1break:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
+; GFX10-NEXT:    ; implicit-def: $sgpr2
+; GFX10-NEXT:    ; implicit-def: $sgpr3
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_branch .LBB6_2
 ; GFX10-NEXT:  .LBB6_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT:    s_or_b32 s0, s1, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s4, s0
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB6_4
 ; GFX10-NEXT:  .LBB6_2: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB6_1
 ; GFX10-NEXT:  ; %bb.3: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB6_2 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
-; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
 ; GFX10-NEXT:    global_load_dword v9, v[7:8], off
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s3, s3, s5
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
 ; GFX10-NEXT:    global_store_dword v[7:8], v9, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 5d88d3c1fc6cd..507edbd961822 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -20,8 +20,8 @@ body: |
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
-  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]]
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
@@ -30,14 +30,14 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]]
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -123,8 +123,8 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
-  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]]
+  ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
@@ -133,14 +133,14 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %41(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
@@ -162,8 +162,8 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
@@ -260,11 +260,11 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -365,8 +365,8 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
@@ -374,14 +374,14 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
@@ -433,11 +433,11 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.3(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
@@ -574,10 +574,10 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.4(0x40000000)
@@ -620,9 +620,9 @@ body: |
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
@@ -757,14 +757,14 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
   ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
@@ -792,10 +792,10 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
@@ -908,14 +908,14 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
@@ -975,11 +975,11 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index 19e96376330a2..a7f24e13c60fc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -12,7 +12,10 @@ define amdgpu_ps void @divergent_i1_phi_if_then(i32 addrspace(1)* %out, i32 %tid
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:  ; %bb.1: ; %B
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 1, v2
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:  ; %bb.2: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
@@ -40,16 +43,23 @@ define amdgpu_ps void @divergent_i1_phi_if_else(i32 addrspace(1)* %out, i32 %tid
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s2, 0, s0
+; GFX10-NEXT:    ; implicit-def: $sgpr0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-NEXT:  ; %bb.1: ; %B
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s0, 2, v2
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s2, exec_lo
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:  ; %bb.2: ; %Flow
 ; GFX10-NEXT:    s_andn2_saveexec_b32 s1, s1
 ; GFX10-NEXT:  ; %bb.3: ; %A
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, 1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 1, v2
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s0, s0, s2
 ; GFX10-NEXT:  ; %bb.4: ; %exit
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 2, 1, s0
@@ -96,36 +106,42 @@ define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-LABEL: loop_with_1break:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX10-NEXT:    s_branch .LBB2_2
 ; GFX10-NEXT:  .LBB2_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_4
 ; GFX10-NEXT:  .LBB2_2: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v5, 31, v4
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT:    s_or_b32 s1, s1, s2
 ; GFX10-NEXT:    v_lshlrev_b64 v[5:6], 2, v[4:5]
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v2, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v3, v6, vcc_lo
 ; GFX10-NEXT:    global_load_dword v7, v[7:8], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v7
-; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_1
 ; GFX10-NEXT:  ; %bb.3: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB2_2 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v5, vcc_lo, v0, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v6, vcc_lo, v1, v6, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v4
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v4
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v4
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
 ; GFX10-NEXT:    global_load_dword v7, v[5:6], off
 ; GFX10-NEXT:    v_mov_b32_e32 v4, v8
+; GFX10-NEXT:    s_and_b32 s3, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, 1, v7
 ; GFX10-NEXT:    global_store_dword v[5:6], v7, off
@@ -159,35 +175,42 @@ define amdgpu_cs void @loop_with_2breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-LABEL: loop_with_2breaks:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_branch .LBB3_3
 ; GFX10-NEXT:  .LBB3_1: ; %Flow3
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:  .LBB3_2: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_3: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT:    s_or_b32 s1, s1, s2
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX10-NEXT:  ; %bb.4: ; %B
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
@@ -198,9 +221,12 @@ define amdgpu_cs void @loop_with_2breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
 ; GFX10-NEXT:    global_load_dword v9, v[7:8], off
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
 ; GFX10-NEXT:    global_store_dword v[7:8], v9, off
@@ -240,38 +266,48 @@ define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-LABEL: loop_with_3breaks:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    v_mov_b32_e32 v8, s0
 ; GFX10-NEXT:    s_branch .LBB4_4
 ; GFX10-NEXT:  .LBB4_1: ; %Flow5
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_andn2_b32 s4, -1, exec_lo
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s4, s4, s5
 ; GFX10-NEXT:  .LBB4_2: ; %Flow4
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s3, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s1, s1, s3
 ; GFX10-NEXT:  .LBB4_3: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT:    s_or_b32 s0, s1, s0
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, s1
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_8
 ; GFX10-NEXT:  .LBB4_4: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v9, 31, v8
-; GFX10-NEXT:    s_mov_b32 s2, -1
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s2, exec_lo, -1
+; GFX10-NEXT:    s_or_b32 s1, s1, s2
 ; GFX10-NEXT:    v_lshlrev_b64 v[9:10], 2, v[8:9]
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v2, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v3, v10, vcc_lo
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s2, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_3
 ; GFX10-NEXT:  ; %bb.5: ; %B
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -281,6 +317,7 @@ define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s5, 1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -291,9 +328,12 @@ define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v0, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v1, v10, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v12, 1, v8
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v8
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v8
+; GFX10-NEXT:    s_andn2_b32 s5, -1, exec_lo
 ; GFX10-NEXT:    global_load_dword v11, v[9:10], off
 ; GFX10-NEXT:    v_mov_b32_e32 v8, v12
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s5, s5, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v11, 1, v11
 ; GFX10-NEXT:    global_store_dword v[9:10], v11, off
@@ -343,40 +383,52 @@ define amdgpu_cs void @loop_with_div_break_with_body(i32 addrspace(1)* %x, i32 a
 ; GFX10-LABEL: loop_with_div_break_with_body:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
+; GFX10-NEXT:    ; implicit-def: $sgpr2
+; GFX10-NEXT:    ; implicit-def: $sgpr3
 ; GFX10-NEXT:    v_mov_b32_e32 v6, s0
 ; GFX10-NEXT:    s_branch .LBB5_2
 ; GFX10-NEXT:  .LBB5_1: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s1
-; GFX10-NEXT:    s_and_b32 s1, exec_lo, s2
-; GFX10-NEXT:    s_or_b32 s0, s1, s0
-; GFX10-NEXT:    s_and_b32 s1, 1, s3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, s1
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s2
+; GFX10-NEXT:    s_or_b32 s0, s4, s0
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s3
+; GFX10-NEXT:    s_or_b32 s1, s1, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s0
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_4
 ; GFX10-NEXT:  .LBB5_2: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v7, 31, v6
-; GFX10-NEXT:    s_mov_b32 s2, -1
-; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, -1
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    v_lshlrev_b64 v[7:8], 2, v[6:7]
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v2, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v3, v8, vcc_lo
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
-; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
+; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB5_1
 ; GFX10-NEXT:  ; %bb.3: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB5_2 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v7, vcc_lo, v0, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v8, vcc_lo, v1, v8, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, 1, v6
-; GFX10-NEXT:    v_cmp_gt_u32_e64 s2, 0x64, v6
-; GFX10-NEXT:    s_mov_b32 s3, 0
+; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 0x64, v6
+; GFX10-NEXT:    s_andn2_b32 s3, s3, exec_lo
 ; GFX10-NEXT:    global_load_dword v9, v[7:8], off
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v6, v10
+; GFX10-NEXT:    s_andn2_b32 s2, s2, exec_lo
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s3, s3, s5
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v9
 ; GFX10-NEXT:    global_store_dword v[7:8], v9, off
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index b6330f63f70ae..e242cfafed7cb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -35,8 +35,8 @@ body: |
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
@@ -100,8 +100,8 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
   ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
@@ -129,8 +129,8 @@ body: |
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
@@ -206,10 +206,10 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %35(s1), %bb.3
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -247,9 +247,9 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -344,10 +344,10 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %48(s1), %bb.3
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -381,9 +381,9 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -410,9 +410,9 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
@@ -536,10 +536,10 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %61(s1), %bb.3
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
@@ -573,9 +573,9 @@ body: |
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.8(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -598,9 +598,9 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
   ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
@@ -629,9 +629,9 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
@@ -778,14 +778,14 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, %56(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %43(s1), %bb.5
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
@@ -845,11 +845,11 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
@@ -982,12 +982,12 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %70(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
   ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
@@ -1057,16 +1057,16 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4
-  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4
-  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
-  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]]
-  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]]
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
index 038866dc78185..97face627cb80 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll
@@ -1,29 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_phi:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
-; GFX10-NEXT:    v_mov_b32_e32 v4, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, s4
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 1
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    ; implicit-def: $sgpr6
 ; GFX10-NEXT:  .LBB0_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v6, v3
-; GFX10-NEXT:    v_mov_b32_e32 v5, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, 1, v3
-; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v6, v0
-; GFX10-NEXT:    v_xor_b32_e32 v4, 1, v5
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v3
+; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s6, s6, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v5
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -49,23 +51,27 @@ define void @temporal_divergent_i1_non_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_non_phi:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v4, s4
+; GFX10-NEXT:    v_mov_b32_e32 v4, s5
+; GFX10-NEXT:    ; implicit-def: $sgpr6
 ; GFX10-NEXT:  .LBB1_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
 ; GFX10-NEXT:    v_xor_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v4
+; GFX10-NEXT:    v_and_b32_e32 v6, 1, v3
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v5, v0
-; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
+; GFX10-NEXT:    s_or_b32 s6, s6, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT:    v_and_b32_e32 v0, 1, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[1:2], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -96,30 +102,41 @@ define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 %x.size, i32 a
 ; GFX10-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s4
+; GFX10-NEXT:    ; implicit-def: $sgpr0
+; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    s_branch .LBB2_3
 ; GFX10-NEXT:  .LBB2_1: ; %loop.body
 ; GFX10-NEXT:    ; in Loop: Header=BB2_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, v1, v7, vcc_lo
 ; GFX10-NEXT:    v_add_nc_u32_e32 v9, 1, v5
-; GFX10-NEXT:    v_cmp_lt_u32_e64 s0, v5, v2
-; GFX10-NEXT:    s_mov_b32 s1, 0
+; GFX10-NEXT:    v_cmp_lt_u32_e32 vcc_lo, v5, v2
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
 ; GFX10-NEXT:    global_load_dword v8, v[6:7], off
+; GFX10-NEXT:    s_mov_b32 s5, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v5, v9
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, vcc_lo
+; GFX10-NEXT:    s_or_b32 s1, s1, s6
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v8
 ; GFX10-NEXT:    global_store_dword v[6:7], v8, off
 ; GFX10-NEXT:  .LBB2_2: ; %Flow
 ; GFX10-NEXT:    ; in Loop: Header=BB2_3 Depth=1
-; GFX10-NEXT:    s_and_b32 s0, exec_lo, s0
-; GFX10-NEXT:    s_or_b32 s4, s0, s4
-; GFX10-NEXT:    s_and_b32 s0, 1, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
+; GFX10-NEXT:    s_and_b32 s5, 1, s5
+; GFX10-NEXT:    s_and_b32 s6, exec_lo, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s5, 0, s5
+; GFX10-NEXT:    s_or_b32 s4, s6, s4
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s0, s0, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB2_5
 ; GFX10-NEXT:  .LBB2_3: ; %A
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; GFX10-NEXT:    s_andn2_b32 s1, s1, exec_lo
+; GFX10-NEXT:    s_and_b32 s5, exec_lo, -1
+; GFX10-NEXT:    s_or_b32 s1, s1, s5
 ; GFX10-NEXT:    v_lshlrev_b64 v[6:7], 2, v[5:6]
 ; GFX10-NEXT:    v_add_co_u32 v8, vcc_lo, v3, v6
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, v4, v7, vcc_lo
@@ -128,8 +145,7 @@ define amdgpu_cs void @loop_with_1break(i32 addrspace(1)* %x, i32 %x.size, i32 a
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
 ; GFX10-NEXT:    s_cbranch_vccnz .LBB2_1
 ; GFX10-NEXT:  ; %bb.4: ; in Loop: Header=BB2_3 Depth=1
-; GFX10-NEXT:    s_mov_b32 s0, -1
-; GFX10-NEXT:    s_mov_b32 s1, 1
+; GFX10-NEXT:    s_mov_b32 s5, 1
 ; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_branch .LBB2_2
 ; GFX10-NEXT:  .LBB2_5: ; %loop.exit.guard
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index 26eca9cd1b34b..a00be03d9c86b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -22,12 +22,12 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
   ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
@@ -109,11 +109,11 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
-  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
   ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
@@ -203,12 +203,12 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.3(0x50000000), %bb.5(0x30000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %42(s1), %bb.5
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32)
@@ -258,11 +258,11 @@ body: |
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
+  ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
   ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]](s1)
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
index d065f228a4e4a..5ee4cfedc7840 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i32(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 6384c47398fce..c25b0f2128266 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; REQUIRES: do-not-run-me
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
@@ -203,26 +202,34 @@ define amdgpu_kernel void @break_loop(i32 %arg) {
 ; CHECK-NEXT:    ; implicit-def: $vgpr1
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_subrev_u32_e32 v0, s2, v0
-; CHECK-NEXT:    s_branch .LBB5_2
-; CHECK-NEXT:  .LBB5_1: ; %Flow
-; CHECK-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
-; CHECK-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; CHECK-NEXT:    ; implicit-def: $sgpr2_sgpr3
+; CHECK-NEXT:    s_branch .LBB5_3
+; CHECK-NEXT:  .LBB5_1: ; %bb4
+; CHECK-NEXT:    ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT:    global_load_dword v2, v[0:1], off glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    v_cmp_ge_i32_e32 vcc, v0, v2
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, vcc
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; CHECK-NEXT:  .LBB5_2: ; %Flow
+; CHECK-NEXT:    ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, s[2:3]
+; CHECK-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[0:1]
-; CHECK-NEXT:    s_cbranch_execz .LBB5_4
-; CHECK-NEXT:  .LBB5_2: ; %bb1
+; CHECK-NEXT:    s_cbranch_execz .LBB5_5
+; CHECK-NEXT:  .LBB5_3: ; %bb1
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_add_u32_e32 v1, 1, v1
+; CHECK-NEXT:    s_andn2_b64 s[2:3], s[2:3], exec
+; CHECK-NEXT:    s_and_b64 s[4:5], exec, -1
 ; CHECK-NEXT:    v_cmp_le_i32_e32 vcc, 0, v1
-; CHECK-NEXT:    s_mov_b64 s[2:3], -1
-; CHECK-NEXT:    s_cbranch_vccnz .LBB5_1
-; CHECK-NEXT:  ; %bb.3: ; %bb4
-; CHECK-NEXT:    ; in Loop: Header=BB5_2 Depth=1
-; CHECK-NEXT:    global_load_dword v2, v[0:1], off glc
-; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    v_cmp_ge_i32_e64 s[2:3], v0, v2
-; CHECK-NEXT:    s_branch .LBB5_1
-; CHECK-NEXT:  .LBB5_4: ; %bb9
+; CHECK-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; CHECK-NEXT:    s_cbranch_vccz .LBB5_1
+; CHECK-NEXT:  ; %bb.4: ; in Loop: Header=BB5_3 Depth=1
+; CHECK-NEXT:    ; implicit-def: $vgpr1
+; CHECK-NEXT:    s_branch .LBB5_2
+; CHECK-NEXT:  .LBB5_5: ; %bb9
 ; CHECK-NEXT:    s_endpgm
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()

>From d344794818b9b94508ed16d8ab47f6c2b0d5f4bd Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Fri, 24 Nov 2023 15:49:30 +0100
Subject: [PATCH 5/6] AMDGPU/GlobalISelDivergenceLowering: constrain incoming
 registers

Implement constrainIncomingRegisterTakenAsIs by constraining incoming
registers taken as-is with the lane mask register class. Most often
such registers only have the S1 LLT.
This is the final step in having the PHI instructions created in this
pass be fully instruction-selected.
---
 .../AMDGPUGlobalISelDivergenceLowering.cpp    |  11 ++
 ...-divergent-i1-phis-no-lane-mask-merging.ll |  24 +--
 ...divergent-i1-phis-no-lane-mask-merging.mir |  66 ++++---
 ...vergence-divergent-i1-used-outside-loop.ll |  36 ++--
 ...ergence-divergent-i1-used-outside-loop.mir | 177 +++++++++---------
 .../GlobalISel/divergence-structurizer.ll     |   2 +-
 .../GlobalISel/divergence-structurizer.mir    | 161 ++++++++--------
 7 files changed, 252 insertions(+), 225 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index 24be01fc1b0f3..a955a08b0a377 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPU.h"
 #include "SILowerI1Copies.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/InitializePasses.h"
@@ -161,8 +162,18 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
   return;
 }
 
+// GlobalISel has to constrain incoming S1 registers taken as-is with the lane
+// mask register class. Insert a copy of Incoming.Reg to a new lane mask inside
+// Incoming.Block; Incoming.Reg then becomes that new lane mask.
 void DivergenceLoweringHelper::constrainIncomingRegisterTakenAsIs(
     Incoming &In) {
+  MachineIRBuilder B(*MF);
+  B.setInsertPt(*In.Block, In.Block->getFirstTerminator());
+
+  auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
+  MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
+  In.Reg = Copy.getReg(0);
+
   return;
 }
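
Not part of the patch, just a note for readers of the regenerated checks:
the S_ANDN2_B32 / S_AND_B32 / S_OR_B32 triples are the usual lane-mask
merge, where lanes inactive in $exec_lo keep the previous value and active
lanes take the newly computed one. A minimal wave32 model of that
computation, with made-up names, is the following sketch:

  #include <cstdint>

  // Illustrative only: wave32 lane-mask merge as plain bit arithmetic.
  // Prev is the previous lane mask, Cur the newly computed per-lane bool,
  // Exec models $exec_lo.
  uint32_t mergeLaneMasks(uint32_t Prev, uint32_t Cur, uint32_t Exec) {
    return (Prev & ~Exec)    // inactive lanes keep their old value
           | (Cur & Exec);   // active lanes take the new value
  }

For example, merging Cur = 0xF into Prev = 0x3 with Exec = 0xC gives 0xF:
the two active lanes are updated and the two inactive ones keep their bits.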
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 6ec152b1f55e3..4db4fc5fd9ec9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Divergent phis that don't require lowering using lane mask merging
 
@@ -145,32 +145,28 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10-LABEL: divergent_i1_phi_used_inside_loop_bigger_loop_body:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0x3e8
-; GFX10-NEXT:    v_mov_b32_e32 v8, s5
+; GFX10-NEXT:    v_mov_b32_e32 v8, s4
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc_lo
 ; GFX10-NEXT:    s_branch .LBB3_2
 ; GFX10-NEXT:  .LBB3_1: ; %loop_body
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v9, v8
-; GFX10-NEXT:    s_xor_b32 s4, s4, -1
+; GFX10-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, 1, v8
 ; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v9, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
-; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
-; GFX10-NEXT:    s_and_b32 s4, exec_lo, s4
-; GFX10-NEXT:    s_or_b32 s6, s6, s4
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  .LBB3_2: ; %loop_start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v9, 1, v9
 ; GFX10-NEXT:    v_cmp_ge_i32_e32 vcc_lo, 0x3e8, v8
 ; GFX10-NEXT:    s_mov_b32 s7, 1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, 0, v9
 ; GFX10-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX10-NEXT:  ; %bb.3: ; %else
 ; GFX10-NEXT:    ; in Loop: Header=BB3_2 Depth=1
@@ -187,7 +183,7 @@ define void @divergent_i1_phi_used_inside_loop_bigger_loop_body(float %val, floa
 ; GFX10-NEXT:    flat_store_dword v[4:5], v1
 ; GFX10-NEXT:    s_branch .LBB3_1
 ; GFX10-NEXT:  .LBB3_6: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index aedc087d27a19..ce0ceb243b9c7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -33,6 +33,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -46,7 +47,8 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY6]](s1), %bb.0, %20(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -54,12 +56,13 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C3]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -126,9 +129,10 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -137,17 +141,17 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -292,19 +296,21 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.0, %39(s1), %bb.5
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000
-  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]]
+  ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI3]](s32), [[C3]]
   ; GFX10-NEXT:   G_BRCOND [[ICMP]](s1), %bb.4
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -336,26 +342,27 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]]
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+  ; GFX10-NEXT:   [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[COPY10]], [[C8]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C9]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY13]](s1), [[C11]], [[C10]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -475,6 +482,7 @@ body: |
   ; GFX10-NEXT:   [[TRUNC1:%[0-9]+]]:_(s1) = G_TRUNC [[AND1]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[TRUNC1]], [[C5]]
+  ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
   ; GFX10-NEXT:   G_BRCOND [[XOR]](s1), %bb.2
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -487,9 +495,10 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0
-  ; GFX10-NEXT:   G_BRCOND [[PHI1]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY3]](s1), %bb.0, %58(s1), %bb.4
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   G_BRCOND [[COPY4]](s1), %bb.5
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -517,6 +526,7 @@ body: |
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[PHI5]](s32), [[AMDGPU_BUFFER_LOAD]]
   ; GFX10-NEXT:   [[OR1:%[0-9]+]]:_(s1) = G_OR [[ICMP]], [[ICMP2]]
   ; GFX10-NEXT:   [[ZEXT1:%[0-9]+]]:_(s32) = G_ZEXT [[OR1]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C10]](s1)
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
@@ -527,7 +537,7 @@ body: |
   ; GFX10-NEXT:   [[OR2:%[0-9]+]]:_(s32) = G_OR [[ZEXT2]], [[C11]]
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI]](s32), %bb.2, [[OR2]](s32), %bb.5
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.2, [[OR2]](s32), %bb.5
   ; GFX10-NEXT:   [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[LOAD]](<8 x s32>)
   ; GFX10-NEXT:   [[ADD3:%[0-9]+]]:_(s32) = G_ADD [[COPY2]], [[COPY1]]
   ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
index 6b2843e3a8405..31745c27ad653 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 ; This file contains various tests that have divergent i1s used outside of
 ; the loop. These are lane masks in sgprs and need to have the correct value in
@@ -137,28 +137,24 @@ define void @divergent_i1_xor_used_outside_loop(float %val, float %pre.cond.val,
 ; GFX10-LABEL: divergent_i1_xor_used_outside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_cmp_lt_f32_e32 vcc_lo, 1.0, v1
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    v_cmp_lt_f32_e64 s5, 1.0, v1
+; GFX10-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX10-NEXT:    ; implicit-def: $sgpr6
-; GFX10-NEXT:    v_mov_b32_e32 v1, s5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc_lo
 ; GFX10-NEXT:  .LBB2_1: ; %loop
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v4, v1
+; GFX10-NEXT:    s_xor_b32 s5, s5, -1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, 1, v1
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v4
-; GFX10-NEXT:    v_cmp_gt_f32_e64 s4, v5, v0
-; GFX10-NEXT:    s_xor_b32 s7, vcc_lo, -1
-; GFX10-NEXT:    s_or_b32 s5, s4, s5
-; GFX10-NEXT:    v_mov_b32_e32 v4, s7
-; GFX10-NEXT:    s_andn2_b32 s4, s6, exec_lo
-; GFX10-NEXT:    s_and_b32 s6, exec_lo, s7
-; GFX10-NEXT:    s_or_b32 s6, s4, s6
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cmp_gt_f32_e32 vcc_lo, v4, v0
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 s6, s6, exec_lo
+; GFX10-NEXT:    s_and_b32 s7, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 s6, s6, s7
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
 ; GFX10-NEXT:  ; %bb.2: ; %exit
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s6
 ; GFX10-NEXT:    flat_store_dword v[2:3], v0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
@@ -197,7 +193,7 @@ define void @divergent_i1_xor_used_outside_loop_larger_loop_body(i32 %num.elts,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:    s_mov_b32 s6, 1
+; GFX10-NEXT:    s_mov_b32 s6, -1
 ; GFX10-NEXT:    s_and_saveexec_b32 s4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_6
 ; GFX10-NEXT:  ; %bb.1: ; %loop.start.preheader
@@ -332,7 +328,7 @@ define void @divergent_i1_icmp_used_outside_loop(i32 %v0, i32 %v1, ptr addrspace
 ; GFX10-NEXT:    s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s4, v1, v4
-; GFX10-NEXT:    s_mov_b32 s7, 1
+; GFX10-NEXT:    s_mov_b32 s7, -1
 ; GFX10-NEXT:    ; implicit-def: $vgpr5
 ; GFX10-NEXT:    s_and_saveexec_b32 s8, s4
 ; GFX10-NEXT:    s_cbranch_execz .LBB4_1
@@ -410,7 +406,7 @@ define amdgpu_ps void @divergent_i1_freeze_used_outside_loop(i32 %n, ptr addrspa
 ; GFX10-LABEL: divergent_i1_freeze_used_outside_loop:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_mov_b32 s0, 0
-; GFX10-NEXT:    s_mov_b32 s3, 1
+; GFX10-NEXT:    s_mov_b32 s3, -1
 ; GFX10-NEXT:    v_mov_b32_e32 v5, s0
 ; GFX10-NEXT:    ; implicit-def: $sgpr1
 ; GFX10-NEXT:    ; implicit-def: $sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 507edbd961822..d4a5903db8adb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -175,14 +175,15 @@ body: |
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   G_BRCOND [[ICMP1]](s1), %bb.1
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[C7]], [[C6]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C7]], [[C6]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -255,37 +256,40 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
   ; GFX10-NEXT:   [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %27(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %24(s1), %bb.1
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY6]], [[C2]]
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
   ; GFX10-NEXT:   [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C5]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
   ; GFX10-NEXT:   SI_RETURN
   bb.0:
@@ -349,7 +353,8 @@ body: |
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY5]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -365,26 +370,26 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %40(s1), %bb.8
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %61(s1), %bb.7
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %48(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.1, %73(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.1, %62(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.1, %49(s1), %bb.7
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
@@ -392,14 +397,14 @@ body: |
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -407,16 +412,16 @@ body: |
   ; GFX10-NEXT:   successors: %bb.7(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]]
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY14]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
@@ -436,15 +441,15 @@ body: |
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY18]], [[C9]]
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY17]](s1), [[PHI4]](s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY19]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
@@ -453,11 +458,11 @@ body: |
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
-  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
-  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY19]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY20]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.2
   bb.0:
@@ -574,7 +579,7 @@ body: |
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[DEF1]](s1), %bb.0, %39(s1), %bb.6
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
   ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
@@ -600,9 +605,10 @@ body: |
   ; GFX10-NEXT:   successors: %bb.5(0x40000000), %bb.6(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -610,21 +616,21 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x80000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]]
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.6:
   ; GFX10-NEXT:   successors: %bb.7(0x04000000), %bb.1(0x7c000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+  ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[COPY8]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
@@ -636,9 +642,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
-  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY11]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY12]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.8:
@@ -751,26 +757,27 @@ body: |
   ; GFX10-NEXT:   [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.3(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %42(s1), %bb.3
-  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, %54(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %43(s1), %bb.3
+  ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY5]](s1), %bb.0, %33(s1), %bb.3
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[COPY8]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY8]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
@@ -783,10 +790,10 @@ body: |
   ; GFX10-NEXT:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.3:
@@ -794,32 +801,32 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[PHI2]](s1), %bb.1, [[DEF2]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
+  ; GFX10-NEXT:   [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY12]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C6]], [[C5]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY16]](s1), [[C6]], [[C5]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index a7f24e13c60fc..bfa5c9cddde84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 ; REQUIRES: do-not-run-me
 
 ; Simplest case, if - then, that requires lane mask merging,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index e242cfafed7cb..d6f225e3ab978 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -18,9 +18,10 @@ body: |
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
@@ -29,18 +30,18 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.2:
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY7]](s1), [[C4]], [[C3]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -91,18 +92,20 @@ body: |
   ; GFX10-NEXT:   [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
   ; GFX10-NEXT:   [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
   ; GFX10-NEXT:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
-  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
   ; GFX10-NEXT:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
+  ; GFX10-NEXT:   [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
+  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[COPY4]](s1)
   ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
   ; GFX10-NEXT:   successors: %bb.2(0x40000000), %bb.4(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
-  ; GFX10-NEXT:   [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32(s1) = PHI [[COPY4]](s1), %bb.0, %20(s1), %bb.3
+  ; GFX10-NEXT:   [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[COPY6]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
   ; GFX10-NEXT:   [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.2
   ; GFX10-NEXT: {{  $}}
@@ -111,9 +114,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -122,19 +125,19 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.1
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.4:
-  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI [[COPY7]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C3]], [[C4]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C3]], [[C4]]
   ; GFX10-NEXT:   G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
   ; GFX10-NEXT:   S_ENDPGM 0
   bb.0:
@@ -368,13 +371,14 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
   ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[COPY9]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -383,9 +387,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY11]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
@@ -402,21 +406,21 @@ body: |
   ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]]
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]]
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY9]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY13]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -560,13 +564,14 @@ body: |
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.5(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
   ; GFX10-NEXT:   [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
   ; GFX10-NEXT:   [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
   ; GFX10-NEXT:   [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD1]](s32), [[C6]]
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[COPY11]](s1)
   ; GFX10-NEXT:   [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -575,9 +580,9 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI3:%[0-9]+]]:sreg_32(s1) = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI1]](s32)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.8
   ; GFX10-NEXT: {{  $}}
@@ -585,26 +590,27 @@ body: |
   ; GFX10-NEXT:   successors: %bb.6(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
   ; GFX10-NEXT:   [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
   ; GFX10-NEXT:   [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
   ; GFX10-NEXT:   [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
   ; GFX10-NEXT:   [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p1) :: (load (s32), addrspace 1)
   ; GFX10-NEXT:   [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD2]](s32), [[C9]]
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
   ; GFX10-NEXT:   [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP2]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.6
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   successors: %bb.3(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:sreg_32(s1) = PHI [[COPY11]](s1), %bb.2, %72(s1), %bb.7
   ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
+  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[COPY16]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.3
   ; GFX10-NEXT: {{  $}}
@@ -621,21 +627,21 @@ body: |
   ; GFX10-NEXT:   [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C11]]
   ; GFX10-NEXT:   [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
   ; GFX10-NEXT:   [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]]
-  ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY15]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.5(0x80000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+  ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[COPY14]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[COPY19]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
@@ -970,6 +976,7 @@ body: |
   ; GFX10-NEXT:   [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
   ; GFX10-NEXT:   G_BR %bb.7
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.1:
@@ -982,19 +989,19 @@ body: |
   ; GFX10-NEXT: bb.2:
   ; GFX10-NEXT:   successors: %bb.4(0x40000000), %bb.7(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %70(s1), %bb.7
+  ; GFX10-NEXT:   [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI %67(s1), %bb.6, %71(s1), %bb.7
   ; GFX10-NEXT:   [[PHI1:%[0-9]+]]:sreg_32(s1) = PHI %49(s1), %bb.6, %48(s1), %bb.7
   ; GFX10-NEXT:   [[PHI2:%[0-9]+]]:sreg_32(s1) = PHI %35(s1), %bb.6, %34(s1), %bb.7
-  ; GFX10-NEXT:   [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
-  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
-  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
-  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
+  ; GFX10-NEXT:   [[COPY8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]](s1)
+  ; GFX10-NEXT:   [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]](s1)
+  ; GFX10-NEXT:   [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]](s1)
+  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY10]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
-  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY9]](s1), %17(s32)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
+  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.4
   ; GFX10-NEXT: {{  $}}
@@ -1011,28 +1018,28 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
   ; GFX10-NEXT:   [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
-  ; GFX10-NEXT:   [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
   ; GFX10-NEXT:   [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
+  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
   ; GFX10-NEXT:   [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
   ; GFX10-NEXT:   [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
   ; GFX10-NEXT:   [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
   ; GFX10-NEXT:   SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.5
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT: bb.5:
   ; GFX10-NEXT:   [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
-  ; GFX10-NEXT:   [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
-  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[COPY3]], [[COPY2]]
+  ; GFX10-NEXT:   [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[COPY3]], [[COPY2]]
   ; GFX10-NEXT:   [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
   ; GFX10-NEXT:   $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
   ; GFX10-NEXT:   SI_RETURN_TO_EPILOG implicit $sgpr0
@@ -1042,14 +1049,14 @@ body: |
   ; GFX10-NEXT: {{  $}}
   ; GFX10-NEXT:   [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
   ; GFX10-NEXT:   [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
-  ; GFX10-NEXT:   [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
   ; GFX10-NEXT:   G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
   ; GFX10-NEXT:   [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
   ; GFX10-NEXT:   G_BR %bb.2
@@ -1057,27 +1064,27 @@ body: |
   ; GFX10-NEXT: bb.7:
   ; GFX10-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
   ; GFX10-NEXT: {{  $}}
-  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
+  ; GFX10-NEXT:   [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = PHI [[COPY7]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI7:%[0-9]+]]:sreg_32(s1) = PHI [[DEF3]](s1), %bb.0, [[PHI7]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI8:%[0-9]+]]:sreg_32(s1) = PHI [[DEF2]](s1), %bb.0, [[PHI1]](s1), %bb.2, [[DEF5]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI9:%[0-9]+]]:sreg_32(s1) = PHI [[DEF1]](s1), %bb.0, [[PHI2]](s1), %bb.2, [[DEF4]](s1), %bb.4
   ; GFX10-NEXT:   [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0
   ; GFX10-NEXT:   [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0
-  ; GFX10-NEXT:   [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
-  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
-  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
-  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
+  ; GFX10-NEXT:   [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
+  ; GFX10-NEXT:   [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]](s1)
+  ; GFX10-NEXT:   [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]](s1)
+  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]](s1)
   ; GFX10-NEXT:   [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-  ; GFX10-NEXT:   [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY21]](s1), $exec_lo, implicit-def $scc
   ; GFX10-NEXT:   [[S_AND_B32_5:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_5:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1)
-  ; GFX10-NEXT:   [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY19]](s1), $exec_lo, implicit-def $scc
-  ; GFX10-NEXT:   [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc
+  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1)
+  ; GFX10-NEXT:   [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc
+  ; GFX10-NEXT:   [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY22]](s1), implicit-def $scc
   ; GFX10-NEXT:   [[S_OR_B32_6:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_6]](s1), [[S_AND_B32_6]](s1), implicit-def $scc
-  ; GFX10-NEXT:   [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1)
-  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY17]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
+  ; GFX10-NEXT:   [[COPY24:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1)
+  ; GFX10-NEXT:   [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
   ; GFX10-NEXT:   G_BR %bb.1
   bb.0:
     successors: %bb.7(0x80000000)

>From 2d7e6f112cad920191031b8b9c64f84455f91d83 Mon Sep 17 00:00:00 2001
From: Petar Avramovic <Petar.Avramovic at amd.com>
Date: Mon, 4 Dec 2023 15:35:03 +0100
Subject: [PATCH 6/6] fix

---
 .../AMDGPUGlobalISelDivergenceLowering.cpp    |  59 ++++++---
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |   2 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.h      |  23 +++-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |   5 +-
 ...divergent-i1-phis-no-lane-mask-merging.mir |   2 +-
 ...ergence-divergent-i1-used-outside-loop.mir |   2 +-
 .../GlobalISel/divergence-structurizer.ll     |  12 +-
 .../GlobalISel/divergence-structurizer.mir    |   2 +-
 .../divergence-temporal-divergent-i1.mir      |   2 +-
 .../divergence-temporal-divergent-reg.mir     |   2 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll | 122 ++++++++++--------
 12 files changed, 142 insertions(+), 96 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index a955a08b0a377..42bb6aa9d0b6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -5,12 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// GlobalISel pass that selects divergent i1 phis as lane mask phis.
-// Lane mask merging uses same algorithm as SDAG in SILowerI1Copies.
-// Handles all cases of temporal divergence.
 //
-// For divergent non-phi i1 and uniform i1 uses outside of the cycle this pass
-// currently depends on LCSSA to insert phis with one incoming.
+/// \file
+/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
+/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
+/// Handles all cases of temporal divergence.
+/// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
+/// pass currently depends on LCSSA to insert phis with one incoming value.
+//
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
@@ -20,7 +22,7 @@
 #include "llvm/CodeGen/MachineUniformityAnalysis.h"
 #include "llvm/InitializePasses.h"
 
-#define DEBUG_TYPE "global-isel-divergence-lowering"
+#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
 
 using namespace llvm;
 
@@ -39,7 +41,7 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
-    return "GlobalISel divergence lowering";
+    return "AMDGPU GlobalISel divergence lowering";
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -91,7 +93,6 @@ void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
     return;
 
   MRI->setRegClass(DstReg, ST->getBoolRC());
-  return;
 }
 
 void DivergenceLoweringHelper::getCandidatesForLowering(
@@ -107,7 +108,6 @@ void DivergenceLoweringHelper::getCandidatesForLowering(
     }
   }
 
-  return;
 }
 
 void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
@@ -128,13 +128,37 @@ void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
 static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
 getInsertAfterPtrs(MachineInstr *MI) {
   MachineBasicBlock *InsertMBB = MI->getParent();
-  return std::make_pair(
-      InsertMBB, InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator())));
+  return {InsertMBB,
+          InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator()))};
 }
 
+// bb.previous
+// %PrevReg = ...
+//
+// bb.current
+// %CurReg = ...
+//
+// %DstReg - not defined yet
+//
+// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
+//
+// bb.previous
+// %PrevReg = ...
+// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
+//
+// bb.current
+// %CurReg = ...
+// %CurRegCopy:sreg_32(s1) = COPY %CurReg
+// ...
+// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, %ExecReg - active lanes to 0
+// %CurMaskedReg:sreg_32(s1)  = AND %ExecReg, %CurRegCopy - inactive lanes to 0
+// %DstReg:sreg_32(s1)        = OR %PrevMaskedReg, %CurMaskedReg
+//
+// DstReg: active lanes take the bit from CurReg, inactive lanes keep PrevReg.
 void DivergenceLoweringHelper::buildMergeLaneMasks(
     MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
     Register DstReg, Register PrevReg, Register CurReg) {
+  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
   // TODO: check if inputs are constants or results of a compare.
 
   Register PrevRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
@@ -158,8 +182,6 @@ void DivergenceLoweringHelper::buildMergeLaneMasks(
   BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
       .addReg(PrevMaskedReg)
       .addReg(CurMaskedReg);
-
-  return;
 }
 
 // GlobalISel has to constrain S1 incoming taken as-is with lane mask register
@@ -173,19 +195,17 @@ void DivergenceLoweringHelper::constrainIncomingRegisterTakenAsIs(
   auto Copy = B.buildCopy(LLT::scalar(1), In.Reg);
   MRI->setRegClass(Copy.getReg(0), ST->getBoolRC());
   In.Reg = Copy.getReg(0);
-
-  return;
 }
 
 } // End anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
-                      "GlobalISel divergence lowering", false, false)
+                      "AMDGPU GlobalISel divergence lowering", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
-                    "GlobalISel divergence lowering", false, false)
+                    "AMDGPU GlobalISel divergence lowering", false, false)
 
 char AMDGPUGlobalISelDivergenceLowering::ID = 0;
 
@@ -208,6 +228,7 @@ bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
   DivergenceLoweringHelper Helper(
       &MF, &DomTree, &getAnalysis<MachinePostDominatorTree>(), &MUI);
 
-  Helper.lowerPhis();
-  return true;
+  bool Changed = false;
+  Changed |= Helper.lowerPhis();
+  return Changed;
 }
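The block comment added above buildMergeLaneMasks describes the merge as: for every lane active in EXEC take the bit from %CurReg, and for every inactive lane keep the bit from %PrevReg. As a rough standalone illustration only (the pass itself emits S_ANDN2_B32/S_AND_B32/S_OR_B32 on sreg_32(s1) virtual registers), the same bit manipulation on plain wave32-sized integers could be sketched like this; the helper name and mask values are made up for the example:

#include <cassert>
#include <cstdint>

// DstReg = (PrevReg & !EXEC) | (CurReg & EXEC), modelled on uint32_t masks.
static uint32_t mergeLaneMasks(uint32_t Prev, uint32_t Cur, uint32_t Exec) {
  uint32_t PrevMasked = Prev & ~Exec; // ANDN2: keep bits of inactive lanes
  uint32_t CurMasked = Exec & Cur;    // AND:   keep bits of active lanes
  return PrevMasked | CurMasked;      // OR:    combine the two halves
}

int main() {
  // Lanes 0-15 are active and take their bits from Cur; lanes 16-31 keep Prev.
  assert(mergeLaneMasks(0xFFFF0000u, 0x0000ABCDu, 0x0000FFFFu) == 0xFFFFABCDu);
  return 0;
}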
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3146f7f5a2185..58e1af50c6eba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -212,10 +212,7 @@ bool isLaneMask(Register Reg, MachineRegisterInfo *MRI,
   if (MRI->getType(Reg) != LLT::scalar(1))
     return false;
   const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
-  if (!RC || !TRI.isSGPRClass(RC))
-    return false;
-
-  return true;
+  return RC && TRI.isSGPRClass(RC);
 }
 
 // PHI where all register operands are sgpr(register class) with S1 LLT.
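For context, isLaneMask above treats a register as a lane mask only when its LLT is s1 and it has already been constrained to an SGPR register class. A small standalone model of that predicate, with simplified stand-in types rather than the real MachineRegisterInfo/TargetRegisterInfo interfaces, might look like:

#include <optional>

// Simplified stand-ins for LLT and register-class info; illustration only.
enum class RegClassKind { SGPR, VGPR };
struct SimpleLLT { unsigned SizeInBits; };
struct VRegInfo {
  SimpleLLT Ty;
  std::optional<RegClassKind> RC; // empty if no register class assigned yet
};

// Mirrors the logic of isLaneMask(): s1 type and an SGPR register class.
static bool isLaneMaskLike(const VRegInfo &R) {
  if (R.Ty.SizeInBits != 1)
    return false;
  return R.RC && *R.RC == RegClassKind::SGPR;
}

int main() {
  VRegInfo LoweredBool{{1}, RegClassKind::SGPR}; // selected as a lane mask
  VRegInfo PlainBool{{1}, std::nullopt};         // ordinary s1, no class yet
  return (isLaneMaskLike(LoweredBool) && !isLaneMaskLike(PlainBool)) ? 0 : 1;
}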
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 59c05bab531f4..4472d6f2e48f6 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -411,7 +411,7 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
   return new SILowerI1Copies();
 }
 
-Register createLaneMaskReg(MachineRegisterInfo *MRI,
+Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
                            Register *LaneMaskRegAttrs) {
   return MRI->cloneVirtualRegister(*LaneMaskRegAttrs);
 }
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
index 99de6ecee5777..b2fccad864103 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -1,10 +1,27 @@
+//===-- SILowerI1Copies.h --------------------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Interface definition of the PhiLoweringHelper class that implements the
+/// lane mask merging algorithm for divergent i1 phis.
+//
+//===----------------------------------------------------------------------===//
+
 #include "GCNSubtarget.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineSSAUpdater.h"
 
-using namespace llvm;
+namespace llvm {
 
+/// Incoming value for a lane mask phi: incoming register \p Reg and incoming
+/// block \p Block are taken from the machine instruction.
+/// \p UpdatedReg (if valid) holds \p Reg merged with another lane mask.
 struct Incoming {
   Register Reg;
   MachineBasicBlock *Block;
@@ -76,4 +93,6 @@ class PhiLoweringHelper {
                                    const DebugLoc &DL, Register DstReg,
                                    Register PrevReg, Register CurReg) = 0;
   virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
-};
\ No newline at end of file
+};
+
+} // end namespace llvm
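The Incoming record documented above carries one phi input: the register and predecessor block read from the G_PHI, plus the register holding the value after lane-mask merging. A mock of that layout, with simplified stand-in types and made-up values purely for illustration (the real lowering fills UpdatedReg through the merging helpers declared in this header), could be:

#include <cstdio>
#include <vector>

using Register = unsigned; // stand-in for llvm::Register
struct BlockMock { int Number; };

// Mirrors the three fields documented for struct Incoming.
struct IncomingMock {
  Register Reg;        // incoming lane-mask register of the phi
  BlockMock *Block;    // predecessor block the value arrives from
  Register UpdatedReg; // Reg merged with the running lane mask, 0 if unset
};

int main() {
  BlockMock Pred{1};
  std::vector<IncomingMock> Incomings;
  Incomings.push_back({/*Reg=*/42, &Pred, /*UpdatedReg=*/0});
  // After merging, the lowering would record the merged register here.
  Incomings.back().UpdatedReg = 43;
  std::printf("incoming %%%u from bb.%d, merged value in %%%u\n",
              Incomings[0].Reg, Incomings[0].Block->Number,
              Incomings[0].UpdatedReg);
  return 0;
}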
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
index 4db4fc5fd9ec9..38c2cd0660df7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll
@@ -221,7 +221,7 @@ exit:
 
 ; There is a divergent, according to machine uniformity info, g_brcond branch
 ; here, not lowered to si_if because of "amdgpu-flat-work-group-size"="1,1".
-define dllexport amdgpu_cs void @_amdgpu_cs_main( i32 inreg noundef %.userdata0, <3 x i32> inreg noundef %.WorkgroupId, <3 x i32> noundef %.LocalInvocationId) #0 {
+define amdgpu_cs void @_amdgpu_cs_main( i32 inreg noundef %.userdata0, <3 x i32> inreg noundef %.WorkgroupId, <3 x i32> noundef %.LocalInvocationId) #0 {
 ; GFX10-LABEL: _amdgpu_cs_main:
 ; GFX10:       ; %bb.0: ; %.entry
 ; GFX10-NEXT:    s_getpc_b64 s[4:5]
@@ -331,9 +331,6 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
 ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
 declare i64 @llvm.amdgcn.s.getpc()
 
-; Function Attrs: nounwind willreturn memory(none)
-declare ptr addrspace(7) @lgc.buffer.desc.to.ptr(<4 x i32>)
-
 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(read)
 declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg)
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
index ce0ceb243b9c7..afeea23a2915f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 --- |
   define void @divergent_i1_phi_uniform_branch() {ret void}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index d4a5903db8adb..ef68c51f47a9a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: divergent_i1_phi_used_outside_loop
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index bfa5c9cddde84..53da86fd0b554 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
 
 ; Simplest case, if - then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
@@ -43,13 +42,12 @@ define amdgpu_ps void @divergent_i1_phi_if_else(i32 addrspace(1)* %out, i32 %tid
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_and_b32 s0, 1, s0
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s2, 0, s0
-; GFX10-NEXT:    ; implicit-def: $sgpr0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, s0
 ; GFX10-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s1, exec_lo, s1
 ; GFX10-NEXT:  ; %bb.1: ; %B
 ; GFX10-NEXT:    v_cmp_gt_u32_e32 vcc_lo, 2, v2
-; GFX10-NEXT:    s_andn2_b32 s0, s2, exec_lo
+; GFX10-NEXT:    s_andn2_b32 s0, s0, exec_lo
 ; GFX10-NEXT:    ; implicit-def: $vgpr2
 ; GFX10-NEXT:    s_and_b32 s2, exec_lo, vcc_lo
 ; GFX10-NEXT:    s_or_b32 s0, s0, s2
@@ -210,7 +208,7 @@ define amdgpu_cs void @loop_with_2breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    ; in Loop: Header=BB3_3 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v10, vcc_lo, v5, v8, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s4, -1
 ; GFX10-NEXT:    global_load_dword v9, v[9:10], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v9
@@ -307,7 +305,7 @@ define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v4, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v5, v10, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s4, 1
+; GFX10-NEXT:    s_mov_b32 s4, -1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
@@ -317,7 +315,7 @@ define amdgpu_cs void @loop_with_3breaks(i32 addrspace(1)* %x, i32 addrspace(1)*
 ; GFX10-NEXT:    ; in Loop: Header=BB4_4 Depth=1
 ; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v6, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v12, vcc_lo, v7, v10, vcc_lo
-; GFX10-NEXT:    s_mov_b32 s5, 1
+; GFX10-NEXT:    s_mov_b32 s5, -1
 ; GFX10-NEXT:    global_load_dword v11, v[11:12], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index d6f225e3ab978..68392e44d0619 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: divergent_i1_phi_if_then
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index a00be03d9c86b..8248b66118d46 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: temporal_divergent_i1_phi
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
index cc6ab89b34b39..4cc68029489e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: temporal_divergent_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4caf83774bbba..acfea72cee42f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -5,7 +5,6 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s
-; REQUIRES: do-not-run-me
 
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:
@@ -1365,25 +1364,28 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_load_dwordx3 v[1:3], v[1:2], s[4:7], 0 addr64
-; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v0
+; GFX7-NEXT:    s_mov_b64 vcc, 0
+; GFX7-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX7-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX7-NEXT:  ; %bb.1: ; %bb
-; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x14
+; GFX7-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x14
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX7-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX7-NEXT:    s_cselect_b32 s6, 1, 0
+; GFX7-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX7-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX7-NEXT:    s_and_b32 s2, 1, s2
+; GFX7-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX7-NEXT:    s_andn2_b64 s[8:9], 0, exec
+; GFX7-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; GFX7-NEXT:    s_or_b64 vcc, s[8:9], s[2:3]
 ; GFX7-NEXT:  .LBB13_2: ; %exit
-; GFX7-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX7-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX7-NEXT:    s_and_b32 s0, 1, s6
-; GFX7-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s0
-; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    s_nop 1
 ; GFX7-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
+; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0 offset:8
 ; GFX7-NEXT:    s_endpgm
@@ -1392,34 +1394,36 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX8:       ; %bb.0: ; %entry
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 2, v0
-; GFX8-NEXT:    s_mov_b32 s4, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    flat_load_dwordx3 v[1:3], v[1:2]
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX8-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v0
+; GFX8-NEXT:    s_mov_b64 vcc, 0
+; GFX8-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX8-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX8-NEXT:  ; %bb.1: ; %bb
-; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
+; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x50
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX8-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX8-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX8-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX8-NEXT:    s_and_b32 s2, 1, s2
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX8-NEXT:    s_andn2_b64 s[6:7], 0, exec
+; GFX8-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; GFX8-NEXT:    s_or_b64 vcc, s[6:7], s[2:3]
 ; GFX8-NEXT:  .LBB13_2: ; %exit
-; GFX8-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_waitcnt vmcnt(0)
+; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_add_u32 s0, s0, 8
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    s_and_b32 s2, 1, s4
-; GFX8-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
-; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    s_nop 2
-; GFX8-NEXT:    v_div_fmas_f32 v2, v1, v2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1429,12 +1433,11 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX10_W32:       ; %bb.0: ; %entry
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
 ; GFX10_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W32-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX10_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
-; GFX10_W32-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_W32-NEXT:    s_mov_b32 s2, 0
-; GFX10_W32-NEXT:    s_and_saveexec_b32 s3, vcc_lo
+; GFX10_W32-NEXT:    v_cmp_eq_u32_e64 s2, 0, v0
+; GFX10_W32-NEXT:    s_and_saveexec_b32 s3, s2
 ; GFX10_W32-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W32-NEXT:  ; %bb.1: ; %bb
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
@@ -1443,11 +1446,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX10_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W32-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX10_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10_W32-NEXT:    s_andn2_b32 s4, 0, exec_lo
+; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; GFX10_W32-NEXT:    s_and_b32 s2, exec_lo, s2
+; GFX10_W32-NEXT:    s_or_b32 vcc_lo, s4, s2
 ; GFX10_W32-NEXT:  .LBB13_2: ; %exit
 ; GFX10_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX10_W32-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX10_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX10_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W32-NEXT:    v_mov_b32_e32 v1, 0
@@ -1459,25 +1465,27 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX10_W64:       ; %bb.0: ; %entry
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x28
 ; GFX10_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX10_W64-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX10_W64-NEXT:    s_mov_b32 s4, 0
+; GFX10_W64-NEXT:    s_mov_b64 vcc, 0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10_W64-NEXT:    global_load_dwordx3 v[1:3], v1, s[2:3]
-; GFX10_W64-NEXT:    s_waitcnt_depctr 0xffe3
-; GFX10_W64-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX10_W64-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v0
+; GFX10_W64-NEXT:    s_and_saveexec_b64 s[4:5], s[2:3]
 ; GFX10_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX10_W64-NEXT:  ; %bb.1: ; %bb
-; GFX10_W64-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x50
+; GFX10_W64-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x50
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GFX10_W64-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX10_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10_W64-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX10_W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX10_W64-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX10_W64-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX10_W64-NEXT:    s_andn2_b64 s[6:7], 0, exec
+; GFX10_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX10_W64-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; GFX10_W64-NEXT:    s_or_b64 vcc, s[6:7], s[2:3]
 ; GFX10_W64-NEXT:  .LBB13_2: ; %exit
-; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX10_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX10_W64-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX10_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX10_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX10_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX10_W64-NEXT:    v_mov_b32_e32 v1, 0
@@ -1489,9 +1497,9 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX11_W32:       ; %bb.0: ; %entry
 ; GFX11_W32-NEXT:    s_load_b64 s[2:3], s[0:1], 0x28
 ; GFX11_W32-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
+; GFX11_W32-NEXT:    s_mov_b32 vcc_lo, 0
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W32-NEXT:    s_mov_b32 s2, 0
 ; GFX11_W32-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX11_W32-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11_W32-NEXT:    s_cbranch_execz .LBB13_2
@@ -1502,11 +1510,14 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX11_W32-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W32-NEXT:    s_cmp_lg_u32 s2, 0
 ; GFX11_W32-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11_W32-NEXT:    s_and_not1_b32 s4, 0, exec_lo
+; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 s2, 0, s2
+; GFX11_W32-NEXT:    s_and_b32 s2, exec_lo, s2
+; GFX11_W32-NEXT:    s_or_b32 vcc_lo, s4, s2
 ; GFX11_W32-NEXT:  .LBB13_2: ; %exit
 ; GFX11_W32-NEXT:    s_or_b32 exec_lo, exec_lo, s3
 ; GFX11_W32-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11_W32-NEXT:    s_and_b32 s2, 1, s2
-; GFX11_W32-NEXT:    v_cmp_ne_u32_e64 vcc_lo, 0, s2
 ; GFX11_W32-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11_W32-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX11_W32-NEXT:    v_mov_b32_e32 v1, 0
@@ -1520,24 +1531,27 @@ define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, [
 ; GFX11_W64:       ; %bb.0: ; %entry
 ; GFX11_W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x28
 ; GFX11_W64-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
-; GFX11_W64-NEXT:    s_mov_b32 s4, 0
+; GFX11_W64-NEXT:    s_mov_b64 vcc, 0
+; GFX11_W64-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11_W64-NEXT:    global_load_b96 v[1:3], v1, s[2:3]
-; GFX11_W64-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX11_W64-NEXT:    v_cmpx_eq_u32_e32 0, v0
 ; GFX11_W64-NEXT:    s_cbranch_execz .LBB13_2
 ; GFX11_W64-NEXT:  ; %bb.1: ; %bb
-; GFX11_W64-NEXT:    s_load_b64 s[4:5], s[0:1], 0x50
+; GFX11_W64-NEXT:    s_load_b64 s[2:3], s[0:1], 0x50
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_load_b32 s4, s[4:5], 0x0
+; GFX11_W64-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11_W64-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11_W64-NEXT:    s_cmp_lg_u32 s4, 0
-; GFX11_W64-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX11_W64-NEXT:    s_cmp_lg_u32 s2, 0
+; GFX11_W64-NEXT:    s_cselect_b32 s2, 1, 0
+; GFX11_W64-NEXT:    s_and_not1_b64 s[6:7], 0, exec
+; GFX11_W64-NEXT:    s_and_b32 s2, 1, s2
+; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, s2
+; GFX11_W64-NEXT:    s_and_b64 s[2:3], exec, s[2:3]
+; GFX11_W64-NEXT:    s_or_b64 vcc, s[6:7], s[2:3]
 ; GFX11_W64-NEXT:  .LBB13_2: ; %exit
-; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX11_W64-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX11_W64-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
-; GFX11_W64-NEXT:    s_and_b32 s2, 1, s4
-; GFX11_W64-NEXT:    v_cmp_ne_u32_e64 vcc, 0, s2
 ; GFX11_W64-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11_W64-NEXT:    v_div_fmas_f32 v0, v1, v2, v3
 ; GFX11_W64-NEXT:    v_mov_b32_e32 v1, 0


