[llvm-branch-commits] [llvm] [AMDGPU] DPP wave reduction for long types - 3 (PR #189226)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Sun Mar 29 04:09:28 PDT 2026


https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/189226

Supported Ops: `and`, `or`, `xor`

>From c32441ef944bf87c91410b5a9f95a136241fd22c Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Sat, 28 Mar 2026 00:07:40 +0530
Subject: [PATCH] [AMDGPU] DPP wave reduction for long types - 3

Supported Ops: `and`, `or`, `xor`
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   13 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll  | 1068 ++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll   | 1068 ++++++++++++++--
 .../CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll  | 1116 +++++++++++++++--
 4 files changed, 2916 insertions(+), 349 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 4eb6bad007d59..dc7f0906159c4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -5738,6 +5738,9 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
   case AMDGPU::V_CMP_GT_I64_e64: // max.i64
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO:
+  case AMDGPU::S_AND_B64:
+  case AMDGPU::S_OR_B64:
+  case AMDGPU::S_XOR_B64:
     DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
     break;
   default:
@@ -5749,6 +5752,12 @@ getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST) {
       ClampOpc = AMDGPU::S_ADD_I32;
     if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
       ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
+    else if (Opc == AMDGPU::S_AND_B64)
+      ClampOpc = AMDGPU::V_AND_B32_e64;
+    else if (Opc == AMDGPU::S_OR_B64)
+      ClampOpc = AMDGPU::V_OR_B32_e64;
+    else if (Opc == AMDGPU::S_XOR_B64)
+      ClampOpc = AMDGPU::V_XOR_B32_e64;
     else
       ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
   }
@@ -6349,8 +6358,10 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
       auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
         bool isAddSubOpc =
             Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
+        bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
+                            Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
         Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
-        if (isAddSubOpc) {
+        if (isAddSubOpc || isBitWiseOpc) {
           Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
           Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
           MachineOperand Src0Operand =
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
index a767a6ab72828..ca82764a788f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.and.ll
@@ -812,6 +812,858 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, -1, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_and_b32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_and_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.and.i64(i64 %in, i32 2)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
 ; GFX8DAGISEL-LABEL: default_stratergy:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -1160,20 +2012,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1188,26 +2040,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -1231,20 +2083,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1258,26 +2110,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1300,20 +2152,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1327,26 +2179,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1369,20 +2221,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1396,26 +2248,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1440,21 +2292,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1470,27 +2322,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, -1
-; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_and_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1515,21 +2367,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, -1
-; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1545,27 +2397,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, -1
-; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1703,14 +2555,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1723,14 +2575,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1743,14 +2595,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1763,14 +2615,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1783,14 +2635,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1802,14 +2654,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1821,14 +2673,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1840,14 +2692,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], -1
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_and_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1859,7 +2711,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1867,7 +2719,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1879,7 +2731,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1887,7 +2739,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1899,7 +2751,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1907,7 +2759,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1918,7 +2770,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], -1
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1926,7 +2778,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_and_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1967,19 +2819,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2016,19 +2868,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2065,19 +2917,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2114,19 +2966,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2167,19 +3019,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2218,19 +3070,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
index 89487bc210bff..f3b28281c4633 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.or.ll
@@ -812,6 +812,858 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_or_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.or.i64(i64 %in, i32 2)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
 ; GFX8DAGISEL-LABEL: default_stratergy:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -1160,20 +2012,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1188,26 +2040,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -1231,20 +2083,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1258,26 +2110,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1300,20 +2152,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1327,26 +2179,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1369,20 +2221,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1396,26 +2248,26 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_or_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1440,21 +2292,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1470,27 +2322,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_or_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1515,21 +2367,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1545,27 +2397,27 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1703,14 +2555,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1723,14 +2575,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1743,14 +2595,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1763,14 +2615,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1783,14 +2635,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1802,14 +2654,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1821,14 +2673,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1840,14 +2692,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -1859,7 +2711,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1867,7 +2719,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1879,7 +2731,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -1887,7 +2739,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -1899,7 +2751,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1907,7 +2759,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1918,7 +2770,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -1926,7 +2778,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -1968,19 +2820,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2017,19 +2869,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2066,19 +2918,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2115,19 +2967,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[6:7], s[6:7]
-; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2168,19 +3020,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2219,19 +3071,19 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
-; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
index 63c42dee8a71a..1ecfc082b8cbc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.xor.ll
@@ -964,6 +964,858 @@ entry:
   ret void
 }
 
+define void @divergent_value_dpp_i64(ptr addrspace(1) %out, i64 %in) {
+; GFX8DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX8DAGISEL:       ; %bb.0: ; %entry
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8DAGISEL-NEXT:    s_nop 0
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8DAGISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8GISEL-LABEL: divergent_value_dpp_i64:
+; GFX8GISEL:       ; %bb.0: ; %entry
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX8GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8GISEL-NEXT:    s_nop 0
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX8GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX8GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX8GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX8GISEL-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
+; GFX8GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX8GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX8GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX8GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX9DAGISEL:       ; %bb.0: ; %entry
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9DAGISEL-NEXT:    s_nop 0
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9GISEL-LABEL: divergent_value_dpp_i64:
+; GFX9GISEL:       ; %bb.0: ; %entry
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX9GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:15 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX9GISEL-NEXT:    s_nop 0
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_bcast:31 row_mask:0xf bank_mask:0xf
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX9GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX9GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX9GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX9GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX9GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX9GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064DAGISEL:       ; %bb.0: ; %entry
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064DAGISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064DAGISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064DAGISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064DAGISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1064GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1064GISEL:       ; %bb.0: ; %entry
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    buffer_store_dword v8, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[4:5]
+; GFX1064GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[4:5]
+; GFX1064GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1064GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1064GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX1064GISEL-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s6
+; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s7
+; GFX1064GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1064GISEL-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; GFX1064GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1064GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1064GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1064GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1064GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1064GISEL-NEXT:    buffer_load_dword v8, off, s[0:3], s32 offset:16
+; GFX1064GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1064GISEL-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1064GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032DAGISEL:       ; %bb.0: ; %entry
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032DAGISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032DAGISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032DAGISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032DAGISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1032GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1032GISEL:       ; %bb.0: ; %entry
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v6, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    buffer_store_dword v7, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_or_saveexec_b32 s6, -1
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s6
+; GFX1032GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1032GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1032GISEL-NEXT:    v_readlane_b32 s4, v4, 31
+; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v5, 31
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s6
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
+; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
+; GFX1032GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off
+; GFX1032GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
+; GFX1032GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1032GISEL-NEXT:    buffer_load_dword v4, off, s[0:3], s32
+; GFX1032GISEL-NEXT:    buffer_load_dword v5, off, s[0:3], s32 offset:4
+; GFX1032GISEL-NEXT:    buffer_load_dword v6, off, s[0:3], s32 offset:8
+; GFX1032GISEL-NEXT:    buffer_load_dword v7, off, s[0:3], s32 offset:12
+; GFX1032GISEL-NEXT:    s_waitcnt_depctr depctr_vm_vsrc(0)
+; GFX1032GISEL-NEXT:    s_mov_b32 exec_lo, s4
+; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1032GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164DAGISEL:       ; %bb.0: ; %entry
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164DAGISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164DAGISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164DAGISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164DAGISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164DAGISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164DAGISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164DAGISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1164GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1164GISEL:       ; %bb.0: ; %entry
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Spill
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_store_b32 off, v8, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s[0:1]
+; GFX1164GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s[0:1]
+; GFX1164GISEL-NEXT:    v_mbcnt_lo_u32_b32 v8, -1, 0
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    v_mbcnt_hi_u32_b32 v8, -1, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_add_nc_u32_e32 v8, 32, v8
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    v_mul_lo_u32 v8, 4, v8
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v6, v4
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    ds_permute_b32 v6, v8, v4
+; GFX1164GISEL-NEXT:    ds_permute_b32 v7, v8, v5
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1164GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1164GISEL-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX1164GISEL-NEXT:    v_readlane_b32 s3, v5, 63
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1164GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1164GISEL-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX1164GISEL-NEXT:    s_waitcnt_depctr depctr_sa_sdst(0)
+; GFX1164GISEL-NEXT:    s_clause 0x4 ; 20-byte Folded Reload
+; GFX1164GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1164GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1164GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1164GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1164GISEL-NEXT:    scratch_load_b32 v8, off, s32 offset:16
+; GFX1164GISEL-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1164GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132DAGISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132DAGISEL:       ; %bb.0: ; %entry
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132DAGISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132DAGISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132DAGISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132DAGISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132DAGISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132DAGISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1132GISEL-LABEL: divergent_value_dpp_i64:
+; GFX1132GISEL:       ; %bb.0: ; %entry
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v4, s32
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v5, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v6, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_store_b32 off, v7, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_or_saveexec_b32 s2, -1
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v2, s2
+; GFX1132GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v3, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v7, v5
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v6, v6 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_mov_b32_dpp v7, v7 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v6, v4 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    ds_swizzle_b32 v7, v5 offset:swizzle(BROADCAST,32,15)
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(1)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v4, v4, v6
+; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX1132GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1132GISEL-NEXT:    v_readlane_b32 s0, v4, 31
+; GFX1132GISEL-NEXT:    v_readlane_b32 s1, v5, 31
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s2
+; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1132GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
+; GFX1132GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
+; GFX1132GISEL-NEXT:    scratch_load_b32 v4, off, s32
+; GFX1132GISEL-NEXT:    scratch_load_b32 v5, off, s32 offset:4
+; GFX1132GISEL-NEXT:    scratch_load_b32 v6, off, s32 offset:8
+; GFX1132GISEL-NEXT:    scratch_load_b32 v7, off, s32 offset:12
+; GFX1132GISEL-NEXT:    s_mov_b32 exec_lo, s0
+; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX1132GISEL-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  %result = call i64 @llvm.amdgcn.wave.reduce.xor.i64(i64 %in, i32 2)
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
 define amdgpu_kernel void @default_stratergy(ptr addrspace(1) %out) {
 ; GFX8DAGISEL-LABEL: default_stratergy:
 ; GFX8DAGISEL:       ; %bb.0: ; %entry
@@ -1304,7 +2156,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1313,24 +2165,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX8DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX8DAGISEL-NEXT:  ; %bb.5:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX8DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX8DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
@@ -1345,7 +2197,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1354,20 +2206,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX8GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX8GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX8GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX8GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX8GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
@@ -1383,7 +2235,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1392,24 +2244,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX9DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX9DAGISEL-NEXT:  ; %bb.5:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX9DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX9DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1423,7 +2275,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1432,20 +2284,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX9GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX9GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX9GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX9GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX9GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1460,7 +2312,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1469,24 +2321,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1064DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1064DAGISEL-NEXT:  ; %bb.5:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1064DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1064DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1500,7 +2352,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_load_dword s6, s[4:5], 0x2c
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1509,20 +2361,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1064GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1064GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1064GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1064GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1064GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1064GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1537,7 +2389,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1546,24 +2398,24 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1032DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1032DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1032DAGISEL-NEXT:  ; %bb.5:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1032DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1032DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1577,7 +2429,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1586,20 +2438,20 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1032GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1032GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1032GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1032GISEL-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1032GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1032GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
@@ -1616,7 +2468,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1626,25 +2478,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s2, s6, s2
-; GFX1164DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164DAGISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1164DAGISEL-NEXT:  ; %bb.5:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
-; GFX1164DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1164DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1660,7 +2512,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_load_b32 s6, s[4:5], 0x2c
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
@@ -1670,21 +2522,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1164GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s6, s2
-; GFX1164GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
 ; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
-; GFX1164GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
 ; GFX1164GISEL-NEXT:    s_xor_b32 s6, s6, s8
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1164GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1164GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
 ; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -1701,7 +2553,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1711,25 +2563,25 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132DAGISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s1, s1, s2
-; GFX1132DAGISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB5_6
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB6_6
 ; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
-; GFX1132DAGISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s1, s1, s6
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB5_4
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB6_4
 ; GFX1132DAGISEL-NEXT:  ; %bb.5:
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
-; GFX1132DAGISEL-NEXT:  .LBB5_6: ; %endif
+; GFX1132DAGISEL-NEXT:  .LBB6_6: ; %endif
 ; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
@@ -1745,7 +2597,7 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
@@ -1755,21 +2607,21 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
 ; GFX1132GISEL-NEXT:    s_and_b32 s2, s2, 1
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s0, s0, s2
-; GFX1132GISEL-NEXT:  .LBB5_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB6_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB5_5
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB6_5
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
 ; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
-; GFX1132GISEL-NEXT:  .LBB5_4: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB6_4: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b32 s0, s0, s6
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB5_4
-; GFX1132GISEL-NEXT:  .LBB5_5: ; %endif
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB6_4
+; GFX1132GISEL-NEXT:  .LBB6_5: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
@@ -1991,14 +2843,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8DAGISEL-NEXT:  ; %bb.2:
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2011,14 +2863,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX8GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX8GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX8GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX8GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX8GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX8GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX8GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX8GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX8GISEL-NEXT:  ; %bb.2:
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2031,14 +2883,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9DAGISEL-NEXT:  ; %bb.2:
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2051,14 +2903,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX9GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX9GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX9GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX9GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX9GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX9GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX9GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX9GISEL-NEXT:  ; %bb.2:
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2071,14 +2923,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064DAGISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064DAGISEL-NEXT:  ; %bb.2:
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2090,14 +2942,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1064GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1064GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1064GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s10, s[6:7]
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v2, s10
 ; GFX1064GISEL-NEXT:    v_readlane_b32 s9, v3, s10
 ; GFX1064GISEL-NEXT:    s_bitset0_b64 s[6:7], s10
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1064GISEL-NEXT:  ; %bb.2:
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2109,14 +2961,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032DAGISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032DAGISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032DAGISEL-NEXT:  ; %bb.2:
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2128,14 +2980,14 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1032GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
-; GFX1032GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1032GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s7, s6
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s8, v2, s7
 ; GFX1032GISEL-NEXT:    v_readlane_b32 s9, v3, s7
 ; GFX1032GISEL-NEXT:    s_bitset0_b32 s6, s7
 ; GFX1032GISEL-NEXT:    s_xor_b64 s[4:5], s[4:5], s[8:9]
 ; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s6, 0
-; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1032GISEL-NEXT:  ; %bb.2:
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -2147,7 +2999,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164DAGISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2155,7 +3007,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164DAGISEL-NEXT:  ; %bb.2:
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2167,7 +3019,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
-; GFX1164GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1164GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s6, s[2:3]
 ; GFX1164GISEL-NEXT:    v_readlane_b32 s4, v2, s6
@@ -2175,7 +3027,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s6
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
-; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1164GISEL-NEXT:  ; %bb.2:
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v2, s0
@@ -2187,7 +3039,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132DAGISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132DAGISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132DAGISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2195,7 +3047,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132DAGISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132DAGISEL-NEXT:  ; %bb.2:
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132DAGISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2206,7 +3058,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
-; GFX1132GISEL-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
+; GFX1132GISEL-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
 ; GFX1132GISEL-NEXT:    v_readlane_b32 s4, v2, s3
@@ -2214,7 +3066,7 @@ define void @divergent_value_i64(ptr addrspace(1) %out, i64 %id.x) {
 ; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
 ; GFX1132GISEL-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
-; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB8_1
 ; GFX1132GISEL-NEXT:  ; %bb.2:
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1132GISEL-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -2234,7 +3086,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8DAGISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2242,7 +3094,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -2270,7 +3122,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX8GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX8GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX8GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2278,10 +3130,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX8GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX8GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX8GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX8GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX8GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX8GISEL-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2290,7 +3142,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8GISEL-NEXT:    s_mul_i32 s6, s4, s7
 ; GFX8GISEL-NEXT:    s_mul_i32 s7, s5, s7
-; GFX8GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX8GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8GISEL-NEXT:    v_mov_b32_e32 v3, s1
@@ -2307,7 +3159,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX9DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -2315,7 +3167,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX9DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX9DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2342,7 +3194,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX9GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX9GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX9GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX9GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2350,10 +3202,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX9GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX9GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX9GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX9GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX9GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX9GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX9GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX9GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2362,7 +3214,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX9GISEL-NEXT:    s_mul_i32 s7, s7, s4
-; GFX9GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX9GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2379,7 +3231,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064DAGISEL-NEXT:    s_mov_b64 s[4:5], exec
 ; GFX1064DAGISEL-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
@@ -2387,7 +3239,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1064DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1064DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2414,7 +3266,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[8:9], vcc
 ; GFX1064GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1064GISEL-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
@@ -2422,10 +3274,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1064GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1064GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
-; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1064GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1064GISEL-NEXT:    s_mov_b64 s[4:5], exec
@@ -2434,7 +3286,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1064GISEL-NEXT:    s_mul_i32 s6, s6, s4
 ; GFX1064GISEL-NEXT:    s_mul_i32 s7, s7, s4
-; GFX1064GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1064GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2451,7 +3303,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr4_sgpr5
 ; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032DAGISEL-NEXT:    s_mov_b32 s4, exec_lo
 ; GFX1032DAGISEL-NEXT:    s_bcnt1_i32_b32 s4, s4
@@ -2459,7 +3311,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s4, s2, s5
 ; GFX1032DAGISEL-NEXT:    s_mul_i32 s5, s3, s5
-; GFX1032DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
@@ -2486,7 +3338,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr6_sgpr7
 ; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s8, vcc_lo
 ; GFX1032GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1032GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1032GISEL-NEXT:    s_bcnt1_i32_b32 s6, s6
@@ -2494,10 +3346,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1032GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1032GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1032GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s2, s8
-; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1032GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
 ; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2506,7 +3358,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1032GISEL-NEXT:    s_mul_i32 s6, s6, s3
 ; GFX1032GISEL-NEXT:    s_mul_i32 s7, s7, s3
-; GFX1032GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1032GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2525,7 +3377,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1164DAGISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164DAGISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2534,7 +3386,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[2:3], s[8:9]
 ; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s6
@@ -2565,7 +3417,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1164GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
 ; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2574,10 +3426,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1164GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1164GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1164GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[2:3], s[8:9]
-; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1164GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1164GISEL-NEXT:    s_mov_b64 s[6:7], exec
@@ -2587,7 +3439,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1164GISEL-NEXT:    s_mul_i32 s6, s4, s7
 ; GFX1164GISEL-NEXT:    s_mul_i32 s7, s5, s7
-; GFX1164GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1164GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, s7
@@ -2606,7 +3458,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
 ; GFX1132DAGISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132DAGISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2615,7 +3467,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132DAGISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132DAGISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132DAGISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s2, s8
 ; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -2645,7 +3497,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
 ; GFX1132GISEL-NEXT:    s_xor_b32 s8, exec_lo, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_2
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
 ; GFX1132GISEL-NEXT:    s_mov_b32 s6, exec_lo
 ; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -2654,10 +3506,10 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s2, s7
 ; GFX1132GISEL-NEXT:    s_mul_i32 s7, s3, s7
-; GFX1132GISEL-NEXT:  .LBB8_2: ; %Flow
+; GFX1132GISEL-NEXT:  .LBB9_2: ; %Flow
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s2, s8
-; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB8_4
+; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB9_4
 ; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
 ; GFX1132GISEL-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
 ; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
@@ -2667,7 +3519,7 @@ define amdgpu_kernel void @divergent_cfg_i64(ptr addrspace(1) %out, i64 %in, i64
 ; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1132GISEL-NEXT:    s_mul_i32 s6, s4, s3
 ; GFX1132GISEL-NEXT:    s_mul_i32 s7, s5, s3
-; GFX1132GISEL-NEXT:  .LBB8_4: ; %endif
+; GFX1132GISEL-NEXT:  .LBB9_4: ; %endif
 ; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s2
 ; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1132GISEL-NEXT:    v_mov_b32_e32 v2, 0



More information about the llvm-branch-commits mailing list