[llvm-branch-commits] [llvm] [AMDGPU] Support Wave Reduction for i16 types - 3 (PR #194812)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jun 10 01:02:09 PDT 2026


https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/194812

>From 7bffc5f92d62eb7bf1ba7047a545f88d1c4c3140 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 28 Apr 2026 15:26:08 +0530
Subject: [PATCH] [AMDGPU] Support Wave Reduction for i16 types - 3

Supported Ops: `and`, `or`, `xor`.
---
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |   5 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   5 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll  | 699 ++++++++++++---
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll   | 699 ++++++++++++---
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll  | 833 ++++++++++++++----
 5 files changed, 1807 insertions(+), 434 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b5db6ea06695d..e341392ae068a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -8174,7 +8174,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_wave_reduce_max:
   case Intrinsic::amdgcn_wave_reduce_umax:
   case Intrinsic::amdgcn_wave_reduce_add:
-  case Intrinsic::amdgcn_wave_reduce_sub: {
+  case Intrinsic::amdgcn_wave_reduce_sub:
+  case Intrinsic::amdgcn_wave_reduce_and:
+  case Intrinsic::amdgcn_wave_reduce_or:
+  case Intrinsic::amdgcn_wave_reduce_xor: {
     Register SrcReg = MI.getOperand(2).getReg();
     if (MRI.getType(SrcReg) != LLT::scalar(16))
       return true;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 93b2e70f42225..a6dc58e6da263 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10765,7 +10765,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_wave_reduce_max:
   case Intrinsic::amdgcn_wave_reduce_umax:
   case Intrinsic::amdgcn_wave_reduce_add:
-  case Intrinsic::amdgcn_wave_reduce_sub: {
+  case Intrinsic::amdgcn_wave_reduce_sub:
+  case Intrinsic::amdgcn_wave_reduce_and:
+  case Intrinsic::amdgcn_wave_reduce_or:
+  case Intrinsic::amdgcn_wave_reduce_xor: {
     EVT SrcVT = Op.getOperand(1).getValueType();
     if (SrcVT == MVT::i16) {
       bool NeedsSignExt = IntrinsicID == Intrinsic::amdgcn_wave_reduce_min ||
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index e39f81a2f6267..2848a793cec7e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -7,10 +7,437 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
+
+define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i16:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i16:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i16:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i16:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT:    s_endpgm
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.and.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i16:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i16:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
+; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i16:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032DAGISEL-NEXT:    s_and_b32 s4, s4, s7
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i16:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, -1
+; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032GISEL-NEXT:    s_and_b32 s4, s4, s7
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_and_b32 s2, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i16:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s2, -1
+; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s0, -1
+; GFX1132DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_and_b32 s0, s0, s3
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i16:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
+; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132GISEL-NEXT:    s_and_b32 s0, s0, s3
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
+; GFX1164DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_readlane_b32 s4, v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_and_b32 s2, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s0, -1
+; GFX1132DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_and_b32 s0, s0, s3
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.and.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
 
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-LABEL: uniform_value:
@@ -130,13 +557,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s4, -1
-; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8DAGISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -150,13 +577,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, -1
-; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8GISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -171,13 +598,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s4, -1
-; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9DAGISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -189,13 +616,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, -1
-; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9GISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -209,13 +636,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, -1
-; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -227,13 +654,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, -1
-; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064GISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -247,13 +674,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, -1
-; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -265,13 +692,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, s5
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -286,14 +713,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, -1
-; GFX1164DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -306,14 +733,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, -1
-; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164GISEL-NEXT:    s_and_b32 s4, s4, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -327,14 +754,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, -1
-; GFX1132DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -347,14 +774,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, -1
-; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2012,20 +2439,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2040,30 +2467,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.5:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2086,20 +2513,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2113,30 +2540,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.5:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2158,20 +2585,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2185,30 +2612,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.5:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2230,20 +2657,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2257,30 +2684,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s1, s1
-; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.5:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2304,21 +2731,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2334,31 +2761,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.5:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2382,21 +2809,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2412,31 +2839,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, s1
-; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.5:
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2574,14 +3001,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2594,14 +3021,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2614,14 +3041,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2634,14 +3061,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2654,14 +3081,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2673,14 +3100,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2692,14 +3119,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2711,14 +3138,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2730,7 +3157,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2738,7 +3165,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2750,7 +3177,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2758,7 +3185,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2770,7 +3197,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2778,7 +3205,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2789,7 +3216,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2797,7 +3224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2838,24 +3265,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2890,24 +3317,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2942,24 +3369,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2994,24 +3421,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3050,17 +3477,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3068,7 +3495,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3105,23 +3532,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[4:5], s[4:5]
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
index aa9735676d697..ae21f8bf67eb4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -7,10 +7,437 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
+
+define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX10DAGISEL-LABEL: uniform_value_i16:
+; GFX10DAGISEL:       ; %bb.0: ; %entry
+; GFX10DAGISEL-NEXT:    s_clause 0x1
+; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX10DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX10DAGISEL-NEXT:    s_endpgm
+;
+; GFX10GISEL-LABEL: uniform_value_i16:
+; GFX10GISEL:       ; %bb.0: ; %entry
+; GFX10GISEL-NEXT:    s_clause 0x1
+; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX10GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX10GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i16:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i16:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1164GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT:    s_endpgm
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.or.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i16:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i16:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT:    s_or_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i16:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032DAGISEL-NEXT:    s_or_b32 s4, s4, s7
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i16:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032GISEL-NEXT:    s_or_b32 s4, s4, s7
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b32 s2, 0
+; GFX1164DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_or_b32 s2, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i16:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-NEXT:    s_or_b32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_or_b32 s0, s0, s3
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i16:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132GISEL-NEXT:    s_or_b32 s0, s0, s3
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b32 s2, 0
+; GFX1164DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_readlane_b32 s4, v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_or_b32 s2, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_or_b32 s0, s0, s3
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.or.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
 
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-LABEL: uniform_value:
@@ -130,13 +557,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8DAGISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -150,13 +577,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8GISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -171,13 +598,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9DAGISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -189,13 +616,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9GISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -209,13 +636,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064DAGISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -227,13 +654,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064GISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -247,13 +674,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032DAGISEL-NEXT:    s_or_b32 s2, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -265,13 +692,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032GISEL-NEXT:    s_or_b32 s2, s2, s5
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -286,14 +713,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164DAGISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -306,14 +733,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164GISEL-NEXT:    s_or_b32 s4, s4, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -327,14 +754,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132DAGISEL-NEXT:    s_or_b32 s2, s2, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -347,14 +774,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132GISEL-NEXT:    s_or_b32 s2, s2, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2012,20 +2439,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2040,30 +2467,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.5:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2086,20 +2513,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2113,30 +2540,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.5:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2158,20 +2585,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2185,30 +2612,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.5:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2230,20 +2657,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2257,30 +2684,30 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s1, s1
-; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.5:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2304,21 +2731,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2334,31 +2761,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s2, s2
-; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.5:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2382,21 +2809,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2412,31 +2839,31 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, s1
-; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.5:
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2574,14 +3001,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2594,14 +3021,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2614,14 +3041,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2634,14 +3061,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2654,14 +3081,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2673,14 +3100,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2692,14 +3119,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2711,14 +3138,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2730,7 +3157,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2738,7 +3165,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2750,7 +3177,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2758,7 +3185,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2770,7 +3197,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2778,7 +3205,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2789,7 +3216,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2797,7 +3224,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2839,24 +3266,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], s[4:5]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2891,24 +3318,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2943,24 +3370,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -2995,24 +3422,24 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], s[6:7]
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3051,17 +3478,17 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -3069,7 +3496,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3106,23 +3533,23 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[4:5], s[4:5]
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
index 057abf6a2d48b..a6c2ef6aaa5a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -7,10 +7,523 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1064GISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1032DAGISEL %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1032GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164DAGISEL,GFX1164DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX1164GISEL,GFX1164GISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=0 < %s | FileCheck -check-prefixes=GFX1132DAGISEL,GFX1132DAGISEL-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -global-isel=1 -new-reg-bank-select < %s | FileCheck -check-prefixes=GFX1132GISEL,GFX1132GISEL-TRUE16 %s
+
+define amdgpu_kernel void @uniform_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: uniform_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_endpgm
+;
+; GFX8GISEL-LABEL: uniform_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX8GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX8GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_endpgm
+;
+; GFX9DAGISEL-LABEL: uniform_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX9DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX9DAGISEL-NEXT:    s_endpgm
+;
+; GFX9GISEL-LABEL: uniform_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX9GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX9GISEL-NEXT:    s_endpgm
+;
+; GFX1064DAGISEL-LABEL: uniform_value_i16:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_clause 0x1
+; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1064DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX1064DAGISEL-NEXT:    s_endpgm
+;
+; GFX1064GISEL-LABEL: uniform_value_i16:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_clause 0x1
+; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX1064GISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1064GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX1064GISEL-NEXT:    s_endpgm
+;
+; GFX1032DAGISEL-LABEL: uniform_value_i16:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_clause 0x1
+; GFX1032DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1032DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1032DAGISEL-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX1032DAGISEL-NEXT:    s_endpgm
+;
+; GFX1032GISEL-LABEL: uniform_value_i16:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_clause 0x1
+; GFX1032GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1032GISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1032GISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1032GISEL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX1032GISEL-NEXT:    s_endpgm
+;
+; GFX1164DAGISEL-LABEL: uniform_value_i16:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_clause 0x1
+; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1164DAGISEL-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX1164DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_endpgm
+;
+; GFX1164GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-FAKE16-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-FAKE16-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-FAKE16-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX1164GISEL-FAKE16-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1164GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1132DAGISEL-LABEL: uniform_value_i16:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_clause 0x1
+; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132DAGISEL-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1132DAGISEL-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1132DAGISEL-NEXT:    s_endpgm
+;
+; GFX1132GISEL-FAKE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-FAKE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-FAKE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-FAKE16-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132GISEL-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-FAKE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-FAKE16-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
+; GFX1132GISEL-FAKE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-FAKE16-NEXT:    s_endpgm
+;
+; GFX1164GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1164GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x2c
+; GFX1164GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1164GISEL-TRUE16-NEXT:    s_mov_b64 s[2:3], exec
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1164GISEL-TRUE16-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1164GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 1
+; GFX1164GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-TRUE16-NEXT:    s_and_b32 s3, s6, 0xffff
+; GFX1164GISEL-TRUE16-NEXT:    s_mul_i32 s2, s3, s2
+; GFX1164GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1164GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1164GISEL-TRUE16-NEXT:    s_endpgm
+;
+; GFX1132GISEL-TRUE16-LABEL: uniform_value_i16:
+; GFX1132GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132GISEL-TRUE16-NEXT:    s_clause 0x1
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b32 s2, s[4:5], 0x2c
+; GFX1132GISEL-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1132GISEL-TRUE16-NEXT:    s_mov_b32 s3, exec_lo
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1132GISEL-TRUE16-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX1132GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT:    s_and_b32 s3, s3, 1
+; GFX1132GISEL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-TRUE16-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1132GISEL-TRUE16-NEXT:    s_mul_i32 s2, s2, s3
+; GFX1132GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
+; GFX1132GISEL-TRUE16-NEXT:    global_store_b16 v1, v0, s[0:1]
+; GFX1132GISEL-TRUE16-NEXT:    s_endpgm
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.xor.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define void @divergent_value_i16(ptr addrspace(1) %out, i16 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_i16:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:  ; %bb.2:
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_i16:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX8GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX8GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:  ; %bb.2:
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    flat_store_short v[0:1], v2
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_i16:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:  ; %bb.2:
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_i16:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX9GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX9GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:  ; %bb.2:
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_i16:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:  ; %bb.2:
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_i16:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
+; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[4:5]
+; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s7
+; GFX1064GISEL-NEXT:    s_bitset0_b64 s[4:5], s7
+; GFX1064GISEL-NEXT:    s_xor_b32 s6, s6, s8
+; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:  ; %bb.2:
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_i16:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032DAGISEL-NEXT:    s_xor_b32 s4, s4, s7
+; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:  ; %bb.2:
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_i16:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1032GISEL-NEXT:    s_mov_b32 s5, exec_lo
+; GFX1032GISEL-NEXT:    s_mov_b32 s4, 0
+; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s6, s5
+; GFX1032GISEL-NEXT:    v_readlane_b32 s7, v2, s6
+; GFX1032GISEL-NEXT:    s_bitset0_b32 s5, s6
+; GFX1032GISEL-NEXT:    s_xor_b32 s4, s4, s7
+; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:  ; %bb.2:
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    global_store_short v[0:1], v2, off
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-FAKE16-NEXT:    s_mov_b32 s2, 0
+; GFX1164DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-FAKE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-FAKE16-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-FAKE16-NEXT:    s_xor_b32 s2, s2, s4
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_i16:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164GISEL-NEXT:    s_mov_b32 s2, 0
+; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s3
+; GFX1164GISEL-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164GISEL-NEXT:    s_xor_b32 s2, s2, s4
+; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:  ; %bb.2:
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-FAKE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-FAKE16-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-FAKE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-FAKE16-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-FAKE16-NEXT:    s_xor_b32 s0, s0, s3
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-FAKE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-FAKE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-FAKE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-FAKE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_i16:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
+; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s3, v2, s2
+; GFX1132GISEL-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132GISEL-NEXT:    s_xor_b32 s0, s0, s3
+; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:  ; %bb.2:
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1164DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b64 s[0:1], exec
+; GFX1164DAGISEL-TRUE16-NEXT:    s_mov_b32 s2, 0
+; GFX1164DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-TRUE16-NEXT:    s_ctz_i32_b64 s3, s[0:1]
+; GFX1164DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1164DAGISEL-TRUE16-NEXT:    v_readlane_b32 s4, v3, s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_bitset0_b64 s[0:1], s3
+; GFX1164DAGISEL-TRUE16-NEXT:    s_xor_b32 s2, s2, s4
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX1164DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1164DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1164DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-TRUE16-LABEL: divergent_value_i16:
+; GFX1132DAGISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.h, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v2.l
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s1, exec_lo
+; GFX1132DAGISEL-TRUE16-NEXT:    s_mov_b32 s0, 0
+; GFX1132DAGISEL-TRUE16-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_ctz_i32_b32 s2, s1
+; GFX1132DAGISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1132DAGISEL-TRUE16-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_bitset0_b32 s1, s2
+; GFX1132DAGISEL-TRUE16-NEXT:    s_xor_b32 s0, s0, s3
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX1132DAGISEL-TRUE16-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-TRUE16-NEXT:  ; %bb.2:
+; GFX1132DAGISEL-TRUE16-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1132DAGISEL-TRUE16-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1132DAGISEL-TRUE16-NEXT:    s_setpc_b64 s[30:31]
+  entry:
+  %result = call i16 @llvm.amdgcn.wave.reduce.xor.i16(i16 %in, i32 1)
+  store i16 %result, ptr addrspace(1) %out
+  ret void
+}
 
 define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-LABEL: uniform_value:
@@ -206,13 +719,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX8DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -226,13 +739,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX8GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX8GISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -247,13 +760,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -265,13 +778,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX9GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX9GISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -285,13 +798,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1064DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -303,13 +816,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1064GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1064GISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -323,13 +836,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1032DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -341,13 +854,13 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1032GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1032GISEL-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -362,14 +875,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1164DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164DAGISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -382,14 +895,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
-; GFX1164GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
 ; GFX1164GISEL-NEXT:    s_xor_b32 s4, s4, s6
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
@@ -403,14 +916,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1132DAGISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -423,14 +936,14 @@ define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out) #0 {
 ; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
-; GFX1132GISEL-NEXT:  .LBB1_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
 ; GFX1132GISEL-NEXT:    s_xor_b32 s2, s2, s5
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2156,7 +2669,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2165,24 +2678,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2197,7 +2710,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2206,24 +2719,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX8GISEL-NEXT:  ; %bb.5:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX8GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -2238,7 +2751,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2247,24 +2760,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2278,7 +2791,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2287,24 +2800,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX9GISEL-NEXT:  ; %bb.5:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX9GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2318,7 +2831,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2327,24 +2840,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2358,7 +2871,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2367,24 +2880,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1064GISEL-NEXT:  ; %bb.5:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1064GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2398,7 +2911,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -2407,24 +2920,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2438,7 +2951,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -2447,24 +2960,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1032GISEL-NEXT:  ; %bb.5:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1032GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2480,7 +2993,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2490,25 +3003,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2524,7 +3037,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -2534,25 +3047,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1164GISEL-NEXT:  ; %bb.5:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1164GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2568,7 +3081,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -2578,25 +3091,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2612,7 +3125,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -2622,25 +3135,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) #0 {
 ; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_6
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_4
 ; GFX1132GISEL-NEXT:  ; %bb.5:
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132GISEL-NEXT:  .LBB6_6: ; %endif
+; GFX1132GISEL-NEXT:  .LBB8_6: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -2862,14 +3375,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2882,14 +3395,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2902,14 +3415,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2922,14 +3435,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2942,14 +3455,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2961,14 +3474,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2980,14 +3493,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2999,14 +3512,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -3018,7 +3531,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -3026,7 +3539,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -3038,7 +3551,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -3046,7 +3559,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -3058,7 +3571,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -3066,7 +3579,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -3077,7 +3590,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB10_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -3085,7 +3598,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) #0 {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB10_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -3105,7 +3618,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -3113,7 +3626,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -3141,7 +3654,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -3149,13 +3662,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX8GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -3166,7 +3679,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_mul_i32 s5, s5, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -3181,7 +3694,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -3189,7 +3702,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -3216,7 +3729,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -3224,13 +3737,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX9GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX9GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -3241,7 +3754,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_mul_i32 s5, s7, s5
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3256,7 +3769,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -3264,7 +3777,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -3291,7 +3804,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -3299,13 +3812,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1064GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -3316,7 +3829,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_mul_i32 s5, s7, s5
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1064GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3331,7 +3844,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -3339,7 +3852,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -3366,7 +3879,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -3374,13 +3887,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1032GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1032GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -3391,7 +3904,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_mul_i32 s5, s7, s3
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1032GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
@@ -3408,7 +3921,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3417,7 +3930,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -3448,7 +3961,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3457,13 +3970,13 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
 ; GFX1164GISEL-NEXT:    s_xor_b64 exec, exec, s[2:3]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -3475,7 +3988,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_mul_i32 s5, s5, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s5
-; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1164GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
@@ -3492,7 +4005,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3501,7 +4014,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -3531,7 +4044,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -3540,12 +4053,12 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB11_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s2
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB11_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -3557,7 +4070,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_mul_i32 s5, s5, s3
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
-; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB11_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1132GISEL-NEXT:    global_store_b64 v2, v[0:1], s[0:1]



More information about the llvm-branch-commits mailing list